From e76e4fe94e4d84167e4bb3b4d8de8c7b05dd9447 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 03:30:15 +0300 Subject: [PATCH 01/33] feat(resourcecontext): add Build generator + wire /api/ai/resources GET (T6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the basic-tier resourceContext builder and wires it into the /api/ai/resources/{kind}/{ns}/{name} GET handler. The package layer is pure-Go and depends only on pkg/topology — callers in internal/* pre- compute IssueSummary and AuditSummary so this package doesn't reach into internal/issues or internal/audit. What ships: - pkg/resourcecontext/build.go: Build(ctx, obj, opts) walks topology relationships + pod spec to populate ManagedBy (Argo tracking-id / Flux labels / owner ref), Exposes, SelectedBy (PDB + NetworkPolicy split), Uses (ConfigMaps/Secrets/PVCs/SA), RunsOn, ScaledBy, and PolicySummary. Every ContextRef passes through opts.AccessChecker; denials are dropped and recorded in Omitted with rbac_denied reason. - pkg/resourcecontext/hints.go: SynthesizeHints renders 5–8 short, deterministic prose lines for AI consumers. EmitHints=false on UI callers leaves Hints empty. - internal/server/rc_rbac.go: requestScopedChecker wraps Server.canRead with a per-request (verb,group,kind,namespace) memoization layer. Collapses the ~30 ref-checks per response into ~5 SAR calls without changing the underlying PermissionCache TTL. - internal/server/ai_handlers.go: handleAIGetResource now returns { resource, resourceContext } by default. ?context=none keeps the pre-T6 bare shape. IssueSummary uses issues.ComposeWithStats scoped to the resource; AuditSummary uses pkg/audit.IndexByResource. handleAIListResources is unchanged — T8/T9 territory. 
Golden tests cover Pod (full enrichment + RBAC denial + JSON shape), Deployment (Flux HelmRelease label precedence), Service (Ingress Exposes), NetworkPolicy (outgoing EdgeProtects intentionally not surfaced), ConfigMap (owner-chain only), PolicyReports rollup, HPA identity, and the EmitHints=false path. Hints determinism is pinned by a separate golden in hints_test.go. --- internal/server/ai_handlers.go | 294 ++++++++++-- internal/server/rc_rbac.go | 88 ++++ pkg/resourcecontext/build.go | 722 ++++++++++++++++++++++++++++++ pkg/resourcecontext/build_test.go | 590 ++++++++++++++++++++++++ pkg/resourcecontext/hints.go | 232 ++++++++++ pkg/resourcecontext/hints_test.go | 118 +++++ 6 files changed, 2013 insertions(+), 31 deletions(-) create mode 100644 internal/server/rc_rbac.go create mode 100644 pkg/resourcecontext/build.go create mode 100644 pkg/resourcecontext/build_test.go create mode 100644 pkg/resourcecontext/hints.go create mode 100644 pkg/resourcecontext/hints_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ea130bac..4db391ef2 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -1,15 +1,22 @@ package server import ( + "context" "fmt" "net/http" "strings" "github.com/go-chi/chi/v5" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/audit" + "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" ) // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. 
@@ -110,8 +117,23 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 s.writeJSON(w, results) } -// handleAIGetResource returns a single minified resource for AI consumption. -// GET /api/ai/resources/{kind}/{namespace}/{name}?group=X&verbosity=summary|detail|compact +// handleAIGetResource returns a single minified resource for AI consumption, +// wrapped with a resourceContext enrichment block by default. +// +// GET /api/ai/resources/{kind}/{namespace}/{name} +// +// Query params: +// - group=X API group disambiguator for CRDs. +// - verbosity=... summary | detail | compact (default: detail). +// - context=none Skip resourceContext build, return bare minified resource. +// +// Response shape (default): +// +// { "resource": , "resourceContext": { ...basic tier... } } +// +// Response shape (context=none): +// +// func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return @@ -121,6 +143,7 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") group := r.URL.Query().Get("group") level := parseVerbosity(r, aicontext.LevelDetail) + skipContext := r.URL.Query().Get("context") == "none" // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace if namespace == "_" { @@ -133,41 +156,250 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { return } - // Try typed cache first - obj, err := k8s.FetchResource(cache, kind, namespace, name) - if err == k8s.ErrUnknownKind { - // Fall through to dynamic cache for CRDs - u, dynErr := cache.GetDynamicWithGroup(r.Context(), kind, namespace, name, group) - if dynErr != nil { - if strings.Contains(dynErr.Error(), "unknown resource kind") { - s.writeError(w, http.StatusBadRequest, dynErr.Error()) - return - } - if strings.Contains(dynErr.Error(), "not found") { - s.writeError(w, http.StatusNotFound, dynErr.Error()) - return - } - 
s.writeError(w, http.StatusInternalServerError, dynErr.Error()) - return - } - s.writeJSON(w, aicontext.MinifyUnstructured(u, level)) - return - } + obj, isUnstructured, err := s.fetchAIResource(r.Context(), cache, kind, namespace, name, group) if err != nil { - if strings.HasPrefix(err.Error(), "forbidden:") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) - return - } - s.writeError(w, http.StatusNotFound, err.Error()) + s.writeAIFetchError(w, kind, err) return } - k8s.SetTypeMeta(obj) - result, err := aicontext.Minify(obj, level) + if !isUnstructured { + k8s.SetTypeMeta(obj) + } + + minified, err := minifyForAI(obj, isUnstructured, level) if err != nil { s.writeError(w, http.StatusInternalServerError, err.Error()) return } - s.writeJSON(w, result) + if skipContext { + s.writeJSON(w, minified) + return + } + + rc := s.buildAIResourceContext(r, obj, kind, namespace, name) + s.writeJSON(w, map[string]any{ + "resource": minified, + "resourceContext": rc, + }) +} + +// fetchAIResource resolves the resource from the typed cache or dynamic cache. +// The bool reports whether the returned object is an unstructured (CRD) value. +func (s *Server) fetchAIResource(ctx context.Context, cache *k8s.ResourceCache, kind, namespace, name, group string) (runtime.Object, bool, error) { + obj, err := k8s.FetchResource(cache, kind, namespace, name) + if err == nil { + return obj, false, nil + } + if err != k8s.ErrUnknownKind { + return nil, false, err + } + u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if dynErr != nil { + return nil, false, dynErr + } + return u, true, nil +} + +// writeAIFetchError maps fetch errors to HTTP status codes. Mirrors the +// previous inline behavior so consumers don't see a status-code drift. 
+func (s *Server) writeAIFetchError(w http.ResponseWriter, kind string, err error) { + msg := err.Error() + switch { + case strings.HasPrefix(msg, "forbidden:"): + s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) + case strings.Contains(msg, "unknown resource kind"): + s.writeError(w, http.StatusBadRequest, msg) + case strings.Contains(msg, "not found"): + s.writeError(w, http.StatusNotFound, msg) + default: + s.writeError(w, http.StatusNotFound, msg) + } +} + +// minifyForAI dispatches to the right Minify variant based on whether the +// resource is unstructured (CRD) or typed. +func minifyForAI(obj runtime.Object, isUnstructured bool, level aicontext.VerbosityLevel) (any, error) { + if isUnstructured { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, fmt.Errorf("internal: object marked unstructured but is %T", obj) + } + return aicontext.MinifyUnstructured(u, level), nil + } + return aicontext.Minify(obj, level) +} + +// buildAIResourceContext assembles the Options struct and calls Build. +// Returns the populated context — never nil unless obj is nil. 
+func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kind, namespace, name string) *resourcecontext.ResourceContext { + if obj == nil { + return nil + } + cache := k8s.GetResourceCache() + + issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) + auditSum := computeAuditSummaryForResource(cache, namespace, name) + + opts := resourcecontext.Options{ + Tier: resourcecontext.TierBasic, + AccessChecker: s.newRequestScopedChecker(r), + EmitHints: true, + IssueSummary: issueSum, + AuditSummary: auditSum, + } + + if topo, prov, dyn, ok := s.topologyForContext(namespace); ok { + opts.Topology = topo + opts.Provider = prov + opts.DynamicProv = dyn + } + + return resourcecontext.Build(r.Context(), obj, opts) +} + +// topologyForContext builds (or fetches the memoized) topology scoped to the +// resource's namespace. Cluster-scoped resources get an all-namespaces build. +// Returns ok=false when the cache isn't ready yet. +func (s *Server) topologyForContext(namespace string) (*topology.Topology, topology.ResourceProvider, topology.DynamicProvider, bool) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, nil, nil, false + } + opts := topology.DefaultBuildOptions() + if namespace != "" { + opts.Namespaces = []string{namespace} + } + opts.IncludeReplicaSets = true + opts.ForRelationshipCache = true + + provider := k8s.NewTopologyResourceProvider(cache) + dyn := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + topo, err := s.topoMemo.Get(opts, func() (*topology.Topology, error) { + return topology.NewBuilder(provider).WithDynamic(dyn).Build(opts) + }) + if err != nil || topo == nil { + return nil, nil, nil, false + } + return topo, provider, dyn, true +} + +// computeIssueSummaryForResource rolls up per-resource issue-composer rows +// (problem + condition + optional audit) into an IssueSummary. 
+// +// The composer is the canonical "what's wrong with this resource" surface — +// it merges problem detection (Deployment/DS/etc.), pod-level conditions, +// and generic CRD condition fallback. Filtering to a single (kind, name) +// is done client-side; the composer's native namespace filter restricts the +// scan to the resource's namespace so we don't walk the whole cluster. +// +// Returns nil when no issues match — Build then omits the IssueSummary field. +func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { + if cache == nil { + return nil + } + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + filters := issues.Filters{ + Kinds: []string{kind}, + Limit: issues.MaxLimit, + } + if namespace != "" { + filters.Namespaces = []string{namespace} + } + rows, _ := issues.ComposeWithStats(provider, filters) + + var count int + var topReason string + var topSeverity issues.Severity + bySource := make(map[string]int) + for _, row := range rows { + if row.Name != name { + continue + } + if namespace != "" && row.Namespace != namespace { + continue + } + count++ + bySource[string(row.Source)]++ + if topSeverity == "" || composeSeverityRank(row.Severity) > composeSeverityRank(topSeverity) { + topSeverity = row.Severity + topReason = row.Reason + } + } + if count == 0 { + return nil + } + return &resourcecontext.IssueSummary{ + Count: count, + HighestSeverity: string(topSeverity), + TopReason: topReason, + BySource: bySource, + } +} + +// composeSeverityRank orders issues.Severity for highest-wins rollup. +func composeSeverityRank(s issues.Severity) int { + switch s { + case issues.SeverityCritical: + return 2 + case issues.SeverityWarning: + return 1 + } + return 0 +} + +// computeAuditSummaryForResource looks up audit findings for the subject +// resource. 
Uses pkg/audit.IndexByResource so the lookup is keyed on the +// canonical (Kind/ns/name) tuple — handles plural→singular normalization +// via the Finding.Kind values written by the check runner. +func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil { + return nil + } + results := audit.RunFromCache(cache, []string{namespace}, nil) + if results == nil || len(results.Findings) == 0 { + return nil + } + idx := bpaudit.IndexByResource(results.Findings) + var match []bpaudit.Finding + for key, fs := range idx { + parts := strings.SplitN(key, "/", 3) + if len(parts) != 3 { + continue + } + if parts[1] == namespace && parts[2] == name { + match = append(match, fs...) + } + } + if len(match) == 0 { + return nil + } + + var topSeverity, topFinding string + for _, f := range match { + if topSeverity == "" || auditSeverityRank(f.Severity) > auditSeverityRank(topSeverity) { + topSeverity = f.Severity + topFinding = f.CheckID + } + } + return &resourcecontext.AuditSummary{ + Count: len(match), + HighestSeverity: topSeverity, + TopFinding: topFinding, + } +} + +// auditSeverityRank orders audit finding severities ("danger" > "warning"). +func auditSeverityRank(s string) int { + switch s { + case bpaudit.SeverityDanger: + return 2 + case bpaudit.SeverityWarning: + return 1 + } + return 0 } diff --git a/internal/server/rc_rbac.go b/internal/server/rc_rbac.go new file mode 100644 index 000000000..13c808055 --- /dev/null +++ b/internal/server/rc_rbac.go @@ -0,0 +1,88 @@ +package server + +import ( + "context" + "net/http" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// requestScopedChecker adapts Server.canRead into resourcecontext.RefAccessChecker +// with a request-local memoization layer keyed on (verb, group, kind, namespace). 
+// +// A single resourceContext build emits ~30 candidate refs but only ~5 distinct +// (group, kind, namespace) tuples — most workloads point at ConfigMaps and +// Secrets in their own namespace, plus a ServiceAccount and a Node. Caching +// here collapses the SAR fan-out before reaching s.canRead's per-user cache. +// +// The map is intentionally request-scoped (not server-scoped): server-scoped +// caching is already in pkg/auth.PermissionCache (2-min TTL) and reused via +// s.canRead. The per-request layer exists only to deduplicate the burst this +// builder generates within a single response. +type requestScopedChecker struct { + s *Server + req *http.Request + cache map[string]bool +} + +// newRequestScopedChecker returns a checker scoped to a single HTTP request. +// Not safe for concurrent use across requests; each handler invocation MUST +// construct its own checker. +func (s *Server) newRequestScopedChecker(r *http.Request) *requestScopedChecker { + return &requestScopedChecker{ + s: s, + req: r, + cache: make(map[string]bool, 8), + } +} + +// CanRead implements resourcecontext.RefAccessChecker. +// +// Authorization rules: +// - Namespaced kinds: SAR on (verb=get, group, resource, namespace). +// - Cluster-scoped kinds (namespace == ""): SAR on (verb=get, group, resource, ""). +// - Unknown kinds (not in discovery, not in static catalogue) pass through — +// mirrors the rest of the codebase's unknown-kind passthrough semantics. +// This is safe because Build only emits refs whose kinds are known to the +// topology builder (which itself uses discovery); a kind unknown here is a +// temporary discovery-cold state, not a permission bypass vector. +func (c *requestScopedChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + key := "get|" + group + "|" + kind + "|" + namespace + if v, ok := c.cache[key]; ok { + return v + } + + resource := lookupResourceName(kind, group) + if resource == "" { + // Unknown kind — passthrough. 
See doc comment for rationale. + c.cache[key] = true + return true + } + + allowed := c.s.canRead(c.req, group, resource, namespace, "get") + c.cache[key] = allowed + return allowed +} + +// Compile-time assertion that requestScopedChecker satisfies the contract. +var _ resourcecontext.RefAccessChecker = (*requestScopedChecker)(nil) + +// lookupResourceName resolves a (kind, group) pair to the canonical plural +// resource name used by SubjectAccessReview. Tries the static cluster-only +// catalogue (covers Nodes / ClusterRoles / etc.), then discovery for everything +// else including CRDs. Returns "" when neither path knows the kind. +func lookupResourceName(kind, group string) string { + if kind == "" { + return "" + } + if g, r, ok := k8s.ClusterOnlyKindGVR(kind); ok && (group == "" || group == g) { + return r + } + if disc := k8s.GetResourceDiscovery(); disc != nil { + if ar, ok := disc.GetResourceWithGroup(kind, group); ok { + return ar.Name + } + } + return "" +} diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go new file mode 100644 index 000000000..c3d7374d4 --- /dev/null +++ b/pkg/resourcecontext/build.go @@ -0,0 +1,722 @@ +package resourcecontext + +import ( + "context" + "sort" + "strings" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + rbacv1 "k8s.io/api/rbac/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/topology" +) + +// Options carries everything Build needs to compute a ResourceContext. 
//
// Per the v1 contract, this package depends only on pkg/topology — callers
// in internal/* pre-compute IssueSummary / AuditSummary / PolicyReports and
// pass them in, so we don't reach into internal/issues or internal/audit.
type Options struct {
	// Tier is copied onto the returned ResourceContext and forwarded to
	// SynthesizeHints.
	Tier      ContextTier
	MaxTokens int // reserved for future budgeting; not enforced in v1

	// AccessChecker gates every emitted ContextRef. nil = no gating (treat
	// as fully authorized — local-kubeconfig / tests).
	AccessChecker RefAccessChecker

	// Topology data sources. When Topology is nil, the topology-derived
	// fields (Exposes, SelectedBy, ScaledBy) are skipped. Provider and
	// DynamicProv are handed to topology.GetRelationships alongside it.
	Topology    *topology.Topology
	Provider    topology.ResourceProvider
	DynamicProv topology.DynamicProvider

	// Pre-computed summaries — pass-through into the response.
	IssueSummary  *IssueSummary
	AuditSummary  *AuditSummary
	PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings

	// EmitHints controls whether SynthesizeHints runs over the structured
	// fields. AI-facing callers (MCP, /api/ai/*) set true; UI callers false.
	EmitHints bool
}

// PolicyReportLookup is the minimal interface Build needs from the
// PolicyReport index. The concrete index lives in pkg/policyreports.
//
// Build does not import pkg/policyreports directly because callers may
// adapt other policy engines into the same shape.
type PolicyReportLookup interface {
	// FindingsFor returns the findings recorded for the resource identified
	// by (kind, namespace, name). Build treats an empty result as "no
	// policy summary".
	FindingsFor(kind, namespace, name string) []KyvernoFinding
}

// RefAccessChecker abstracts the RBAC check so this package doesn't import
// any internal/* package. REST and MCP handlers each implement this with a
// request-scoped batch cache (see internal/server/rc_rbac.go).
//
// Implementations should treat (group, kind, namespace) as the cache key —
// per-name SAR has no upside since RBAC is namespace-granular.
+type RefAccessChecker interface { + CanRead(ctx context.Context, group, kind, namespace string) bool +} + +// Build produces a ResourceContext for obj at the requested tier. +// +// Returns nil when obj is nil. Returns a zero-value (.Tier-only) +// ResourceContext when obj is recognized but no enrichment fields apply. +// Never panics on nil sub-fields of opts. +func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceContext { + if obj == nil { + return nil + } + + ident, ok := identityOf(obj) + if !ok { + return &ResourceContext{Tier: opts.Tier} + } + + rc := &ResourceContext{Tier: opts.Tier} + omitted := newOmittedTracker() + + // 1. ManagedBy — owner chain + GitOps labels/annotations + rc.ManagedBy = filterRefs( + ctx, opts.AccessChecker, + buildManagedBy(ident), + "managedBy", omitted, + ) + + // 2. Topology-derived: Exposes, SelectedBy, ScaledBy + var rel *topology.Relationships + if opts.Topology != nil { + rel = topology.GetRelationships(ident.Kind, ident.Namespace, ident.Name, opts.Topology, opts.Provider, opts.DynamicProv) + } + if rel != nil { + exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) + exposes = append(exposes, rel.Services...) + exposes = append(exposes, rel.Ingresses...) + exposes = append(exposes, rel.Gateways...) + exposes = append(exposes, rel.Routes...) + rc.Exposes = filterRefs(ctx, opts.AccessChecker, + toContextRefs(exposes, ReasonLabelSelector, SourceTopology), + "exposes", omitted) + + selected := make([]topology.ResourceRef, 0, len(rel.PDBs)+len(rel.NetworkPolicies)) + selected = append(selected, rel.PDBs...) + selected = append(selected, rel.NetworkPolicies...) 
+ rc.SelectedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(selected, ReasonPodSelector, SourceTopology), + "selectedBy", omitted) + + rc.ScaledBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(rel.Scalers, ReasonScaleTargetRef, SourceTopology), + "scaledBy", omitted) + } + + // 3. Pod-specific: Uses + RunsOn + if pod, ok := obj.(*corev1.Pod); ok { + rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) + + if pod.Spec.NodeName != "" { + candidate := &ContextRef{ + Kind: "Node", + Name: pod.Spec.NodeName, + Reason: ReasonNodeName, + Source: SourceK8sSpec, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.RunsOn = candidate + } else { + omitted.add("runsOn", OmittedRBACDenied) + } + } + } + + // 4. Pre-computed summaries — pass-through. + rc.IssueSummary = opts.IssueSummary + rc.AuditSummary = opts.AuditSummary + + // 5. PolicyReports — Kyverno findings rolled up. + if opts.PolicyReports != nil { + findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) + if len(findings) > 0 { + rc.PolicySummary = buildPolicySummary(findings) + } + } + + // 6. Hints — AI-only. + if opts.EmitHints { + rc.Hints = SynthesizeHints(rc, opts.Tier) + } + + rc.Omitted = omitted.collect() + return rc +} + +// --------------------------------------------------------------------------- +// Identity extraction +// --------------------------------------------------------------------------- + +// resourceIdentity is the projection of obj that Build needs without holding +// on to the full runtime.Object. Owner refs and labels feed ManagedBy; the +// (Kind, Namespace, Name) tuple keys topology + summary lookups. +type resourceIdentity struct { + Kind string + Group string + Namespace string + Name string + Labels map[string]string + Annotations map[string]string + Owners []metav1.OwnerReference +} + +// identityOf extracts identity from a typed K8s object or unstructured. 
+// Returns (_, false) for unknown shapes so callers can short-circuit. +func identityOf(obj runtime.Object) (resourceIdentity, bool) { + if obj == nil { + return resourceIdentity{}, false + } + switch v := obj.(type) { + case *corev1.Pod: + return identFromMeta("Pod", "", &v.ObjectMeta), true + case *corev1.Service: + return identFromMeta("Service", "", &v.ObjectMeta), true + case *corev1.ConfigMap: + return identFromMeta("ConfigMap", "", &v.ObjectMeta), true + case *corev1.Secret: + return identFromMeta("Secret", "", &v.ObjectMeta), true + case *corev1.Node: + return identFromMeta("Node", "", &v.ObjectMeta), true + case *corev1.Namespace: + return identFromMeta("Namespace", "", &v.ObjectMeta), true + case *corev1.PersistentVolume: + return identFromMeta("PersistentVolume", "", &v.ObjectMeta), true + case *corev1.PersistentVolumeClaim: + return identFromMeta("PersistentVolumeClaim", "", &v.ObjectMeta), true + case *corev1.ServiceAccount: + return identFromMeta("ServiceAccount", "", &v.ObjectMeta), true + case *corev1.Event: + return identFromMeta("Event", "", &v.ObjectMeta), true + case *corev1.LimitRange: + return identFromMeta("LimitRange", "", &v.ObjectMeta), true + case *appsv1.Deployment: + return identFromMeta("Deployment", "apps", &v.ObjectMeta), true + case *appsv1.DaemonSet: + return identFromMeta("DaemonSet", "apps", &v.ObjectMeta), true + case *appsv1.StatefulSet: + return identFromMeta("StatefulSet", "apps", &v.ObjectMeta), true + case *appsv1.ReplicaSet: + return identFromMeta("ReplicaSet", "apps", &v.ObjectMeta), true + case *autoscalingv2.HorizontalPodAutoscaler: + return identFromMeta("HorizontalPodAutoscaler", "autoscaling", &v.ObjectMeta), true + case *batchv1.Job: + return identFromMeta("Job", "batch", &v.ObjectMeta), true + case *batchv1.CronJob: + return identFromMeta("CronJob", "batch", &v.ObjectMeta), true + case *networkingv1.Ingress: + return identFromMeta("Ingress", "networking.k8s.io", &v.ObjectMeta), true + case 
*networkingv1.NetworkPolicy: + return identFromMeta("NetworkPolicy", "networking.k8s.io", &v.ObjectMeta), true + case *policyv1.PodDisruptionBudget: + return identFromMeta("PodDisruptionBudget", "policy", &v.ObjectMeta), true + case *storagev1.StorageClass: + return identFromMeta("StorageClass", "storage.k8s.io", &v.ObjectMeta), true + case *rbacv1.Role: + return identFromMeta("Role", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRole: + return identFromMeta("ClusterRole", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.RoleBinding: + return identFromMeta("RoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRoleBinding: + return identFromMeta("ClusterRoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *unstructured.Unstructured: + gvk := v.GroupVersionKind() + return resourceIdentity{ + Kind: gvk.Kind, + Group: gvk.Group, + Namespace: v.GetNamespace(), + Name: v.GetName(), + Labels: v.GetLabels(), + Annotations: v.GetAnnotations(), + Owners: v.GetOwnerReferences(), + }, true + } + return resourceIdentity{}, false +} + +func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { + return resourceIdentity{ + Kind: kind, + Group: group, + Namespace: m.Namespace, + Name: m.Name, + Labels: m.Labels, + Annotations: m.Annotations, + Owners: m.OwnerReferences, + } +} + +// --------------------------------------------------------------------------- +// ManagedBy detection +// --------------------------------------------------------------------------- + +// GitOps label/annotation keys — kept in sync with packages/k8s-ui/src/utils/gitops-owner.ts. 
+const ( + argoTrackingIDAnnotation = "argocd.argoproj.io/tracking-id" + argoInstanceLabel = "argocd.argoproj.io/instance" + fluxKustomizeNameLabel = "kustomize.toolkit.fluxcd.io/name" + fluxKustomizeNSLabel = "kustomize.toolkit.fluxcd.io/namespace" + fluxHelmNameLabel = "helm.toolkit.fluxcd.io/name" + fluxHelmNSLabel = "helm.toolkit.fluxcd.io/namespace" +) + +// buildManagedBy returns the ContextRefs describing what manages this +// resource. Precedence (most-specific wins): +// 1. Flux HelmRelease labels +// 2. Flux Kustomization labels +// 3. Argo tracking-id annotation +// 4. Argo instance label +// 5. First owner reference (controller=true preferred) +// +// Only one path emits today — the field is a slice so future taxonomies +// (e.g. dual ArgoCD + Flux) can list multiple managers without a wire change. +func buildManagedBy(ident resourceIdentity) []ContextRef { + if name, ns, ok := readPair(ident.Labels, fluxHelmNameLabel, fluxHelmNSLabel); ok { + return []ContextRef{{ + Kind: "HelmRelease", + Group: "helm.toolkit.fluxcd.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + if name, ns, ok := readPair(ident.Labels, fluxKustomizeNameLabel, fluxKustomizeNSLabel); ok { + return []ContextRef{{ + Kind: "Kustomization", + Group: "kustomize.toolkit.fluxcd.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + if id := ident.Annotations[argoTrackingIDAnnotation]; id != "" { + if ns, name, ok := parseArgoTrackingID(id); ok && name != "" { + return []ContextRef{{ + Kind: "Application", + Group: "argoproj.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + } + if inst := ident.Labels[argoInstanceLabel]; inst != "" { + // App namespace unknown without tracking-id — emit with empty ns + // like the UI does; the consumer decides whether to navigate. 
+ return []ContextRef{{ + Kind: "Application", + Group: "argoproj.io", + Name: inst, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + + if owner := pickControllerOwner(ident.Owners); owner != nil { + group := groupFromAPIVersion(owner.APIVersion) + return []ContextRef{{ + Kind: owner.Kind, + Group: group, + Namespace: ident.Namespace, + Name: owner.Name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + return nil +} + +func readPair(m map[string]string, k1, k2 string) (string, string, bool) { + a := m[k1] + b := m[k2] + if a == "" || b == "" { + return "", "", false + } + return a, b, true +} + +// parseArgoTrackingID mirrors gitops-owner.ts. Two forms: +// +// ":..." (legacy, single name) +// "_:..." (namespaced install) +// +// Returns (ns, name, ok). +func parseArgoTrackingID(value string) (string, string, bool) { + colon := strings.IndexByte(value, ':') + if colon < 0 { + return "", "", false + } + head := value[:colon] + if head == "" { + return "", "", false + } + if sep := strings.IndexByte(head, '_'); sep >= 0 { + ns := head[:sep] + name := head[sep+1:] + if name == "" { + return "", "", false + } + return ns, name, true + } + return "", head, true +} + +// pickControllerOwner returns the first owner with Controller=true; falls +// back to the first owner if none are marked controller. Returns nil when +// the slice is empty. +func pickControllerOwner(owners []metav1.OwnerReference) *metav1.OwnerReference { + for i := range owners { + if owners[i].Controller != nil && *owners[i].Controller { + return &owners[i] + } + } + if len(owners) > 0 { + return &owners[0] + } + return nil +} + +// groupFromAPIVersion extracts the group from "group/version" or "version" +// (core/v1 form). Mirrors schema.ParseGroupVersion without the import. 
+func groupFromAPIVersion(apiVersion string) string { + if i := strings.IndexByte(apiVersion, '/'); i >= 0 { + return apiVersion[:i] + } + return "" +} + +// --------------------------------------------------------------------------- +// Uses (Pod-specific) +// --------------------------------------------------------------------------- + +// buildUsesFromPod extracts ConfigMap/Secret/PVC/ServiceAccount references +// from pod.Spec. Returns nil when the pod uses no configuration. +// +// Sources scanned: +// - Volumes: ConfigMap / Secret / PVC / Projected (configMap + secret entries) +// - Containers (init + regular): EnvFrom configMapRef/secretRef, Env valueFrom.{configMap,secret}KeyRef +// - Spec.ServiceAccountName +func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + if pod == nil { + return nil + } + + cmSet := newRefSet() + secretSet := newRefSet() + pvcSet := newRefSet() + + scanVolumes(pod.Spec.Volumes, pod.Namespace, cmSet, secretSet, pvcSet) + scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) + scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) + + uses := &UsesBlock{ + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonEnvVarRef, SourceK8sSpec), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonVolumeMount, SourceK8sSpec), "uses.secrets", omitted), + PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), + } + + if sa := pod.Spec.ServiceAccountName; sa != "" { + candidate := &ContextRef{ + Kind: "ServiceAccount", + Namespace: pod.Namespace, + Name: sa, + Reason: ReasonSAName, + Source: SourceK8sSpec, + } + if checkRef(ctx, ac, candidate) { + uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + if len(uses.ConfigMaps) == 0 && len(uses.Secrets) == 0 && len(uses.PVCs) == 
0 && uses.ServiceAccount == nil { + return nil + } + return uses +} + +func scanVolumes(vols []corev1.Volume, ns string, cm, secret, pvc *refSet) { + for _, v := range vols { + if v.ConfigMap != nil { + cm.add(v.ConfigMap.Name, ns) + } + if v.Secret != nil { + secret.add(v.Secret.SecretName, ns) + } + if v.PersistentVolumeClaim != nil { + pvc.add(v.PersistentVolumeClaim.ClaimName, ns) + } + if v.Projected != nil { + for _, src := range v.Projected.Sources { + if src.ConfigMap != nil { + cm.add(src.ConfigMap.Name, ns) + } + if src.Secret != nil { + secret.add(src.Secret.Name, ns) + } + } + } + } +} + +func scanContainers(containers []corev1.Container, ns string, cm, secret *refSet) { + for _, c := range containers { + for _, ef := range c.EnvFrom { + if ef.ConfigMapRef != nil { + cm.add(ef.ConfigMapRef.Name, ns) + } + if ef.SecretRef != nil { + secret.add(ef.SecretRef.Name, ns) + } + } + for _, e := range c.Env { + if e.ValueFrom == nil { + continue + } + if e.ValueFrom.ConfigMapKeyRef != nil { + cm.add(e.ValueFrom.ConfigMapKeyRef.Name, ns) + } + if e.ValueFrom.SecretKeyRef != nil { + secret.add(e.ValueFrom.SecretKeyRef.Name, ns) + } + } + } +} + +// refSet collects (name, namespace) pairs with insertion-order preservation +// for deterministic output. Names with empty namespaces are tolerated (the +// PVC ClaimName can be cluster-scoped only in odd configurations, but we +// pass through whatever the pod spec says). +type refSet struct { + seen map[string]bool + order []nsName +} + +type nsName struct { + Namespace string + Name string +} + +func newRefSet() *refSet { + return &refSet{seen: make(map[string]bool)} +} + +func (s *refSet) add(name, ns string) { + if name == "" { + return + } + key := ns + "/" + name + if s.seen[key] { + return + } + s.seen[key] = true + s.order = append(s.order, nsName{Namespace: ns, Name: name}) +} + +// refs returns the accumulated set as ContextRefs sorted by (namespace, name) +// for deterministic golden output. 
+func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) []ContextRef { + if len(s.order) == 0 { + return nil + } + out := make([]ContextRef, len(s.order)) + sorted := append([]nsName(nil), s.order...) + sort.Slice(sorted, func(i, j int) bool { + if sorted[i].Namespace != sorted[j].Namespace { + return sorted[i].Namespace < sorted[j].Namespace + } + return sorted[i].Name < sorted[j].Name + }) + for i, e := range sorted { + out[i] = ContextRef{ + Kind: kind, + Group: group, + Namespace: e.Namespace, + Name: e.Name, + Reason: reason, + Source: source, + } + } + return out +} + +// --------------------------------------------------------------------------- +// Topology ref → ContextRef +// --------------------------------------------------------------------------- + +// toContextRefs translates a slice of topology.ResourceRef into ContextRefs +// with the given reason+source. Sorted by (kind, namespace, name) for +// determinism — golden tests rely on this ordering. +func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSource) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + for _, r := range refs { + out = append(out, ContextRef{ + Kind: r.Kind, + Group: r.Group, + Namespace: r.Namespace, + Name: r.Name, + Reason: reason, + Source: source, + }) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Kind != out[j].Kind { + return out[i].Kind < out[j].Kind + } + if out[i].Namespace != out[j].Namespace { + return out[i].Namespace < out[j].Namespace + } + return out[i].Name < out[j].Name + }) + return out +} + +// --------------------------------------------------------------------------- +// RBAC gating +// --------------------------------------------------------------------------- + +// filterRefs applies the access check to each ref. Denied refs are dropped +// and one omitted entry is recorded per field (deduped by the tracker). 
+// When ac is nil (local-kubeconfig / no auth), every ref passes. +func filterRefs(ctx context.Context, ac RefAccessChecker, refs []ContextRef, fieldPath string, omitted *omittedTracker) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + deniedAny := false + for _, r := range refs { + if !checkRef(ctx, ac, &r) { + deniedAny = true + continue + } + out = append(out, r) + } + if deniedAny { + omitted.add(fieldPath, OmittedRBACDenied) + } + if len(out) == 0 { + return nil + } + return out +} + +// checkRef returns true when ac permits a read of (group, kind, namespace). +// Nil ac = permit everything. +func checkRef(ctx context.Context, ac RefAccessChecker, r *ContextRef) bool { + if ac == nil || r == nil { + return true + } + return ac.CanRead(ctx, r.Group, r.Kind, r.Namespace) +} + +// --------------------------------------------------------------------------- +// Policy summary +// --------------------------------------------------------------------------- + +// buildPolicySummary rolls up Kyverno findings into the summary block. +// Top findings are picked first by fail > warn > error > pass, then by +// stable input order — callers can prune to MAX (3 today). +const policySummaryTopMax = 3 + +func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { + var fail, warn, pass int + for _, f := range findings { + switch f.Result { + case "fail": + fail++ + case "warn": + warn++ + case "pass": + pass++ + } + } + + // Order Top by result priority; cap to policySummaryTopMax. + ordered := append([]KyvernoFinding(nil), findings...) 
+ sort.SliceStable(ordered, func(i, j int) bool { + return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) + }) + if len(ordered) > policySummaryTopMax { + ordered = ordered[:policySummaryTopMax] + } + + return &PolicySummary{ + Kyverno: &KyvernoSummary{ + Fail: fail, + Warn: warn, + Pass: pass, + Top: ordered, + }, + } +} + +func resultRank(r string) int { + switch r { + case "fail": + return 0 + case "warn": + return 1 + case "error": + return 2 + case "pass": + return 3 + default: + return 4 + } +} + +// --------------------------------------------------------------------------- +// Omitted tracker +// --------------------------------------------------------------------------- + +// omittedTracker deduplicates (field, reason) entries so callers don't emit +// "managedBy" / OmittedRBACDenied twice when multiple refs in the same field +// fail. Insertion order is preserved for stable JSON output. +type omittedTracker struct { + seen map[string]bool + items []OmittedField +} + +func newOmittedTracker() *omittedTracker { + return &omittedTracker{seen: make(map[string]bool)} +} + +func (t *omittedTracker) add(field string, reason OmittedReason) { + key := field + "|" + string(reason) + if t.seen[key] { + return + } + t.seen[key] = true + t.items = append(t.items, OmittedField{Field: field, Reason: reason}) +} + +func (t *omittedTracker) collect() []OmittedField { + if len(t.items) == 0 { + return nil + } + return t.items +} diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go new file mode 100644 index 000000000..503fee9d9 --- /dev/null +++ b/pkg/resourcecontext/build_test.go @@ -0,0 +1,590 @@ +package resourcecontext + +import ( + "context" + "encoding/json" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + 
"github.com/skyhook-io/radar/pkg/topology" +) + +// --------------------------------------------------------------------------- +// Test scaffolding +// --------------------------------------------------------------------------- + +// allowAllChecker permits every CanRead check. Used by the happy-path +// goldens that don't exercise RBAC denial. +type allowAllChecker struct{} + +func (allowAllChecker) CanRead(_ context.Context, _, _, _ string) bool { return true } + +// denyChecker denies a specific (group, kind, namespace) tuple and permits +// everything else. Tests the "omitted: rbac_denied" path without requiring +// the full server stack. +type denyChecker struct { + group string + kind string + namespace string +} + +func (d denyChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + return !(group == d.group && kind == d.kind && namespace == d.namespace) +} + +// mockPolicyReports implements PolicyReportLookup. +type mockPolicyReports map[string][]KyvernoFinding + +func (m mockPolicyReports) FindingsFor(kind, namespace, name string) []KyvernoFinding { + return m[kind+"/"+namespace+"/"+name] +} + +// --------------------------------------------------------------------------- +// Golden-file tests +// --------------------------------------------------------------------------- + +func TestBuild_Pod_FullEnrichment(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web-abc", + Namespace: "prod", + Labels: map[string]string{ + "app.kubernetes.io/name": "web", + }, + Annotations: map[string]string{ + argoTrackingIDAnnotation: "argocd_storefront:apps/Deployment:prod/web", + }, + OwnerReferences: []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "web-7d", Controller: ptrBool(true)}, + }, + }, + Spec: corev1.PodSpec{ + NodeName: "node-1", + ServiceAccountName: "web-sa", + Volumes: []corev1.Volume{ + { + Name: "config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: 
&corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "web-config"}, + }, + }, + }, + { + Name: "creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"}, + }, + }, + { + Name: "data", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "web-data"}, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "web", + EnvFrom: []corev1.EnvFromSource{ + {ConfigMapRef: &corev1.ConfigMapEnvSource{LocalObjectReference: corev1.LocalObjectReference{Name: "shared-env"}}}, + }, + Env: []corev1.EnvVar{ + { + Name: "API_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api-key-secret"}, + Key: "key", + }, + }, + }, + }, + }, + }, + }, + } + + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "pod/prod/web-abc", Kind: topology.KindPod, Name: "web-abc"}, + {ID: "service/prod/web", Kind: topology.KindService, Name: "web"}, + {ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"}, + {ID: "poddisruptionbudget/prod/web-pdb", Kind: topology.KindPDB, Name: "web-pdb"}, + {ID: "horizontalpodautoscaler/prod/web-hpa", Kind: topology.KindHPA, Name: "web-hpa"}, + }, + Edges: []topology.Edge{ + {Source: "service/prod/web", Target: "pod/prod/web-abc", Type: topology.EdgeRoutesTo}, + {Source: "networkpolicy/prod/default-deny", Target: "pod/prod/web-abc", Type: topology.EdgeProtects}, + {Source: "poddisruptionbudget/prod/web-pdb", Target: "pod/prod/web-abc", Type: topology.EdgeProtects}, + {Source: "horizontalpodautoscaler/prod/web-hpa", Target: "pod/prod/web-abc", Type: topology.EdgeUses}, + }, + } + + opts := Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + IssueSummary: &IssueSummary{ + Count: 1, HighestSeverity: "critical", TopReason: 
"ImagePullBackOff", + BySource: map[string]int{"problem": 1}, + }, + } + + rc := Build(context.Background(), pod, opts) + if rc == nil { + t.Fatal("Build returned nil") + } + + // ManagedBy: argo tracking-id annotation wins over owner reference. + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d (%+v)", got, want, rc.ManagedBy) + } + mb := rc.ManagedBy[0] + if mb.Kind != "Application" || mb.Name != "storefront" || mb.Namespace != "argocd" { + t.Errorf("ManagedBy[0]: got %+v, want Application argocd/storefront", mb) + } + if mb.Source != SourceOwnerChain { + t.Errorf("ManagedBy[0].Source: got %q want %q", mb.Source, SourceOwnerChain) + } + + // Exposes: the Service routes to the pod. + if got, want := len(rc.Exposes), 1; got != want { + t.Fatalf("Exposes len: got %d want %d (%+v)", got, want, rc.Exposes) + } + if rc.Exposes[0].Kind != "Service" || rc.Exposes[0].Name != "web" { + t.Errorf("Exposes[0]: got %+v want Service/prod/web", rc.Exposes[0]) + } + + // SelectedBy: NP + PDB, sorted by kind (NetworkPolicy < PodDisruptionBudget). + if got, want := len(rc.SelectedBy), 2; got != want { + t.Fatalf("SelectedBy len: got %d want %d (%+v)", got, want, rc.SelectedBy) + } + if rc.SelectedBy[0].Kind != "NetworkPolicy" || rc.SelectedBy[1].Kind != "PodDisruptionBudget" { + t.Errorf("SelectedBy order: got %s,%s want NetworkPolicy,PodDisruptionBudget", + rc.SelectedBy[0].Kind, rc.SelectedBy[1].Kind) + } + + // ScaledBy: HPA. + if got, want := len(rc.ScaledBy), 1; got != want { + t.Fatalf("ScaledBy len: got %d want %d", got, want) + } + if rc.ScaledBy[0].Kind != "HorizontalPodAutoscaler" { + t.Errorf("ScaledBy[0].Kind: got %q", rc.ScaledBy[0].Kind) + } + + // RunsOn: Node. + if rc.RunsOn == nil || rc.RunsOn.Name != "node-1" { + t.Errorf("RunsOn: got %+v want Node/node-1", rc.RunsOn) + } + + // Uses: 2 ConfigMaps (web-config + shared-env), 2 Secrets (web-creds + api-key-secret), 1 PVC, ServiceAccount. 
+ if rc.Uses == nil { + t.Fatal("Uses: got nil") + } + if got, want := len(rc.Uses.ConfigMaps), 2; got != want { + t.Errorf("Uses.ConfigMaps len: got %d want %d (%+v)", got, want, rc.Uses.ConfigMaps) + } + if got, want := len(rc.Uses.Secrets), 2; got != want { + t.Errorf("Uses.Secrets len: got %d want %d (%+v)", got, want, rc.Uses.Secrets) + } + if got, want := len(rc.Uses.PVCs), 1; got != want { + t.Errorf("Uses.PVCs len: got %d want %d", got, want) + } + if rc.Uses.ServiceAccount == nil || rc.Uses.ServiceAccount.Name != "web-sa" { + t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount) + } + + // Hints: deterministic ordering covers the high-signal fields. + wantHints := []string{ + "Managed by Application storefront", + "1 issue (critical: ImagePullBackOff)", + "Running on node node-1", + "Exposed by 1 Service", + "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", + "Scaled by 1 HorizontalPodAutoscaler", + "Uses 2 ConfigMaps, 2 Secrets, 1 PVC, ServiceAccount web-sa", + } + if !reflect.DeepEqual(rc.Hints, wantHints) { + t.Errorf("Hints mismatch.\n got: %v\nwant: %v", rc.Hints, wantHints) + } + + // Pre-computed summaries are passed through. + if rc.IssueSummary == nil || rc.IssueSummary.Count != 1 { + t.Errorf("IssueSummary not passed through: %+v", rc.IssueSummary) + } + if rc.AuditSummary != nil { + t.Errorf("AuditSummary: want nil, got %+v", rc.AuditSummary) + } +} + +func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { + // Flux HelmRelease labels take precedence over owner references — + // owner is "ReplicaSet web-7d" but Flux labels point at HelmRelease. 
+ dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web", + Namespace: "prod", + Labels: map[string]string{ + fluxHelmNameLabel: "web", + fluxHelmNSLabel: "flux-system", + }, + }, + } + + rc := Build(context.Background(), dep, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + if rc == nil { + t.Fatal("Build returned nil") + } + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d", got, want) + } + mb := rc.ManagedBy[0] + if mb.Kind != "HelmRelease" || mb.Name != "web" || mb.Namespace != "flux-system" { + t.Errorf("ManagedBy[0]: got %+v want HelmRelease/flux-system/web", mb) + } + if mb.Group != "helm.toolkit.fluxcd.io" { + t.Errorf("ManagedBy[0].Group: got %q", mb.Group) + } + wantHint := "Managed by HelmRelease web" + if len(rc.Hints) == 0 || rc.Hints[0] != wantHint { + t.Errorf("first Hint: got %v want %q", rc.Hints, wantHint) + } +} + +func TestBuild_Service_ExposedByIngress(t *testing.T) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "service/prod/api", Kind: topology.KindService, Name: "api"}, + {ID: "ingress/prod/api-ingress", Kind: topology.KindIngress, Name: "api-ingress"}, + }, + Edges: []topology.Edge{ + {Source: "ingress/prod/api-ingress", Target: "service/prod/api", Type: topology.EdgeRoutesTo}, + }, + } + rc := Build(context.Background(), svc, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + }) + + if got, want := len(rc.Exposes), 1; got != want { + t.Fatalf("Exposes len: got %d want %d", got, want) + } + if rc.Exposes[0].Kind != "Ingress" || rc.Exposes[0].Name != "api-ingress" { + t.Errorf("Exposes[0]: got %+v", rc.Exposes[0]) + } + // Service has no Uses block — make sure we don't synthesize an empty one. 
+ if rc.Uses != nil { + t.Errorf("Uses should be nil for Service: got %+v", rc.Uses) + } +} + +func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { + // NetworkPolicy on the "policy side" emits an outgoing EdgeProtects to + // the workload it selects. The topology relationships projection does + // NOT surface that direction (see relationships.go's intentional skip). + // Build inherits this — the NP should have nothing in SelectedBy. + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default-deny", Namespace: "prod"}, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "networkpolicy/prod/default-deny", Target: "deployment/prod/web", Type: topology.EdgeProtects}, + }, + } + rc := Build(context.Background(), np, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + }) + if rc == nil { + t.Fatal("Build returned nil") + } + if len(rc.SelectedBy) != 0 { + t.Errorf("SelectedBy: expected empty (outgoing EdgeProtects not surfaced), got %+v", rc.SelectedBy) + } +} + +func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { + // A ConfigMap with a controller owner reference. No topology, no Pod + // spec — just owner-chain ManagedBy. 
+ cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web-config", + Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "Deployment", APIVersion: "apps/v1", Name: "web", Controller: ptrBool(true)}, + }, + }, + } + rc := Build(context.Background(), cm, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d", got, want) + } + mb := rc.ManagedBy[0] + if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" || mb.Group != "apps" { + t.Errorf("ManagedBy[0]: got %+v", mb) + } +} + +func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { + // Deny reads on Secrets in the pod's namespace — buildUsesFromPod + // should drop them all and emit an omitted entry. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{ + Name: "creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"}, + }, + }}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: denyChecker{group: "", kind: "Secret", namespace: "prod"}, + EmitHints: true, + }) + if rc.Uses != nil && len(rc.Uses.Secrets) != 0 { + t.Errorf("Secrets should be empty after deny; got %+v", rc.Uses.Secrets) + } + gotOmitted := false + for _, o := range rc.Omitted { + if o.Field == "uses.secrets" && o.Reason == OmittedRBACDenied { + gotOmitted = true + break + } + } + if !gotOmitted { + t.Errorf("expected omitted [uses.secrets, rbac_denied]; got %+v", rc.Omitted) + } +} + +func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{{Kind: "Foo", APIVersion: "ex.io/v1", Name: "f", Controller: ptrBool(true)}}}, + } + rc := Build(context.Background(), dep, 
Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: false, + }) + if len(rc.Hints) != 0 { + t.Errorf("EmitHints=false but got hints: %v", rc.Hints) + } + // Structured fields still populated. + if len(rc.ManagedBy) != 1 { + t.Errorf("ManagedBy should still be populated: %+v", rc.ManagedBy) + } +} + +func TestBuild_NilObj(t *testing.T) { + if rc := Build(context.Background(), nil, Options{}); rc != nil { + t.Errorf("Build(nil) = %+v, want nil", rc) + } +} + +func TestBuild_HPA_Identity(t *testing.T) { + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "web-hpa", Namespace: "prod"}, + } + rc := Build(context.Background(), hpa, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc == nil { + t.Fatal("Build returned nil for HPA") + } + if rc.Tier != TierBasic { + t.Errorf("Tier: got %q want %q", rc.Tier, TierBasic) + } +} + +func TestBuild_PolicyReports_RolledUp(t *testing.T) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + EmitHints: true, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) == 0 || k.Top[0].Result != "fail" { + t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + } + gotHint := false + for _, h := range rc.Hints { + if h == "Kyverno: 1 failing, 1 warning" { + gotHint = 
true + break + } + } + if !gotHint { + t.Errorf("expected Kyverno hint; got %v", rc.Hints) + } +} + +func TestBuild_PDB_OutputJSONShape(t *testing.T) { + // Pin the wire shape one full populated Build produces, so a future + // reorder of fields (or accidental omitempty change) is caught. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "rs", Controller: ptrBool(true)}, + }, + }, + Spec: corev1.PodSpec{NodeName: "n1"}, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + b, err := json.MarshalIndent(rc, "", " ") + if err != nil { + t.Fatalf("marshal: %v", err) + } + // Spot-check: tier basic, owner ref managedBy, runsOn node. + want := `"managedBy"` + if !contains(string(b), want) { + t.Errorf("JSON missing %s\n%s", want, b) + } + if !contains(string(b), `"tier": "basic"`) { + t.Errorf("JSON missing tier=basic\n%s", b) + } + if !contains(string(b), `"runsOn"`) { + t.Errorf("JSON missing runsOn\n%s", b) + } +} + +// --------------------------------------------------------------------------- +// Sub-helpers' unit coverage +// --------------------------------------------------------------------------- + +func TestParseArgoTrackingID(t *testing.T) { + cases := []struct { + in string + wantNS string + wantName string + wantOK bool + shortName string + }{ + {"argocd_store:apps/Deployment:prod/web", "argocd", "store", true, "namespaced form"}, + {"store:apps/Deployment:prod/web", "", "store", true, "legacy form"}, + {"", "", "", false, "empty"}, + {":foo/bar", "", "", false, "missing head"}, + {"a_:foo", "", "", false, "missing name"}, + } + for _, c := range cases { + t.Run(c.shortName, func(t *testing.T) { + ns, name, ok := parseArgoTrackingID(c.in) + if ns != c.wantNS || name != c.wantName || ok != c.wantOK { + t.Errorf("parseArgoTrackingID(%q) = (%q, %q, 
%v) want (%q, %q, %v)", + c.in, ns, name, ok, c.wantNS, c.wantName, c.wantOK) + } + }) + } +} + +func TestGroupFromAPIVersion(t *testing.T) { + cases := map[string]string{ + "v1": "", + "apps/v1": "apps", + "argoproj.io/v1alpha1": "argoproj.io", + "networking.k8s.io/v1": "networking.k8s.io", + "helm.toolkit.fluxcd.io/v2beta1": "helm.toolkit.fluxcd.io", + } + for in, want := range cases { + if got := groupFromAPIVersion(in); got != want { + t.Errorf("groupFromAPIVersion(%q) = %q, want %q", in, got, want) + } + } +} + +func TestPickControllerOwner_PrefersController(t *testing.T) { + owners := []metav1.OwnerReference{ + {Kind: "Other", Name: "x"}, + {Kind: "Boss", Name: "ctrl", Controller: ptrBool(true)}, + } + got := pickControllerOwner(owners) + if got == nil || got.Name != "ctrl" { + t.Errorf("got %+v, want ctrl", got) + } +} + +func TestPickControllerOwner_FallsBackToFirst(t *testing.T) { + owners := []metav1.OwnerReference{ + {Kind: "Solo", Name: "first"}, + {Kind: "Other", Name: "x"}, + } + got := pickControllerOwner(owners) + if got == nil || got.Name != "first" { + t.Errorf("got %+v, want first", got) + } + if got := pickControllerOwner(nil); got != nil { + t.Errorf("nil owners should return nil, got %+v", got) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func ptrBool(b bool) *bool { return &b } + +func contains(s, sub string) bool { + return len(s) >= len(sub) && indexOf(s, sub) >= 0 +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} + +// Compile-time pin: keep PDB and Networking imports referenced for future tests. 
+var ( + _ = policyv1.PodDisruptionBudget{} +) diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go new file mode 100644 index 000000000..30ccd698b --- /dev/null +++ b/pkg/resourcecontext/hints.go @@ -0,0 +1,232 @@ +package resourcecontext + +import ( + "fmt" + "sort" + "strings" +) + +// SynthesizeHints renders a short, deterministic prose summary of the +// structured fields in c. Returns at most maxHintsBasic lines for +// TierBasic; future tiers can expand the budget. +// +// Ordering is fixed (not data-driven) so golden tests stay stable across +// runs. No LLM is involved — every line maps to a single rule. +// +// Callers SHOULD NOT parse hints — the structured fields are the canonical +// surface. Hints exist solely as a prose convenience for AI consumers. +func SynthesizeHints(c *ResourceContext, tier ContextTier) []string { + if c == nil { + return nil + } + + max := maxHintsBasic + if tier == TierDiagnostic { + max = maxHintsDiagnostic + } + + out := make([]string, 0, max) + + if h := managedByHint(c.ManagedBy); h != "" { + out = append(out, h) + } + if h := issueHint(c.IssueSummary); h != "" { + out = append(out, h) + } + if h := auditHint(c.AuditSummary); h != "" { + out = append(out, h) + } + if h := runsOnHint(c.RunsOn); h != "" { + out = append(out, h) + } + if h := exposesHint(c.Exposes); h != "" { + out = append(out, h) + } + if h := selectedByHint(c.SelectedBy); h != "" { + out = append(out, h) + } + if h := scaledByHint(c.ScaledBy); h != "" { + out = append(out, h) + } + if h := usesHint(c.Uses); h != "" { + out = append(out, h) + } + if h := policyHint(c.PolicySummary); h != "" { + out = append(out, h) + } + + if len(out) > max { + out = out[:max] + } + if len(out) == 0 { + return nil + } + return out +} + +const ( + maxHintsBasic = 8 + maxHintsDiagnostic = 12 +) + +func managedByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + m := refs[0] + return fmt.Sprintf("Managed by %s %s", m.Kind, m.Name) +} + 
+func issueHint(s *IssueSummary) string { + if s == nil || s.Count == 0 { + return "" + } + noun := pluralize("issue", s.Count) + var b strings.Builder + fmt.Fprintf(&b, "%d %s", s.Count, noun) + if s.HighestSeverity != "" { + fmt.Fprintf(&b, " (%s", s.HighestSeverity) + if s.TopReason != "" { + fmt.Fprintf(&b, ": %s", s.TopReason) + } + b.WriteString(")") + } else if s.TopReason != "" { + fmt.Fprintf(&b, ": %s", s.TopReason) + } + return b.String() +} + +func auditHint(s *AuditSummary) string { + if s == nil || s.Count == 0 { + return "" + } + noun := pluralize("audit finding", s.Count) + if s.HighestSeverity == "" { + return fmt.Sprintf("%d %s", s.Count, noun) + } + return fmt.Sprintf("%d %s (%s)", s.Count, noun, s.HighestSeverity) +} + +func runsOnHint(r *ContextRef) string { + if r == nil { + return "" + } + return fmt.Sprintf("Running on node %s", r.Name) +} + +func exposesHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + return fmt.Sprintf("Exposed by %s", summarizeKindsCounts(refs)) +} + +func selectedByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + // Distinguish PDB vs NetworkPolicy in the hint — they read very + // differently to a human, and lumping them together loses signal. 
+ var pdb, np []ContextRef + for _, r := range refs { + if r.Kind == "PodDisruptionBudget" { + pdb = append(pdb, r) + } else { + np = append(np, r) + } + } + parts := make([]string, 0, 2) + if n := len(np); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) + } + if n := len(pdb); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) + } + return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) +} + +func selectVerb(n int) string { + if n == 1 { + return "selects this resource" + } + return "select this resource" +} + +func scaledByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + return fmt.Sprintf("Scaled by %s", summarizeKindsCounts(refs)) +} + +func usesHint(u *UsesBlock) string { + if u == nil { + return "" + } + parts := make([]string, 0, 4) + if n := len(u.ConfigMaps); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("ConfigMap", n))) + } + if n := len(u.Secrets); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("Secret", n))) + } + if n := len(u.PVCs); n > 0 { + parts = append(parts, fmt.Sprintf("%d PVCs", n)) + if n == 1 { + parts[len(parts)-1] = "1 PVC" + } + } + if u.ServiceAccount != nil { + parts = append(parts, fmt.Sprintf("ServiceAccount %s", u.ServiceAccount.Name)) + } + if len(parts) == 0 { + return "" + } + return "Uses " + strings.Join(parts, ", ") +} + +func policyHint(s *PolicySummary) string { + if s == nil || s.Kyverno == nil { + return "" + } + k := s.Kyverno + if k.Fail == 0 && k.Warn == 0 { + return "" + } + parts := make([]string, 0, 2) + if k.Fail > 0 { + parts = append(parts, fmt.Sprintf("%d failing", k.Fail)) + } + if k.Warn > 0 { + parts = append(parts, fmt.Sprintf("%d warning", k.Warn)) + } + return "Kyverno: " + strings.Join(parts, ", ") +} + +// summarizeKindsCounts groups refs by kind and emits "N Kind, M OtherKind" +// (deterministic order: alphabetical by kind). 
+func summarizeKindsCounts(refs []ContextRef) string { + counts := make(map[string]int) + for _, r := range refs { + counts[r.Kind]++ + } + kinds := make([]string, 0, len(counts)) + for k := range counts { + kinds = append(kinds, k) + } + sort.Strings(kinds) + parts := make([]string, 0, len(kinds)) + for _, k := range kinds { + parts = append(parts, fmt.Sprintf("%d %s", counts[k], pluralize(k, counts[k]))) + } + return strings.Join(parts, ", ") +} + +// pluralize returns word + "s" when n != 1. Kept English-only; resource +// kinds are loanwords (Pod, Service, etc.) so naive pluralization works. +func pluralize(word string, n int) string { + if n == 1 { + return word + } + return word + "s" +} diff --git a/pkg/resourcecontext/hints_test.go b/pkg/resourcecontext/hints_test.go new file mode 100644 index 000000000..8bc0a30d6 --- /dev/null +++ b/pkg/resourcecontext/hints_test.go @@ -0,0 +1,118 @@ +package resourcecontext + +import ( + "reflect" + "testing" +) + +func TestSynthesizeHints_NilCtx(t *testing.T) { + if got := SynthesizeHints(nil, TierBasic); got != nil { + t.Errorf("nil ctx: got %v, want nil", got) + } +} + +func TestSynthesizeHints_EmptyCtx(t *testing.T) { + rc := &ResourceContext{Tier: TierBasic} + got := SynthesizeHints(rc, TierBasic) + if got != nil { + t.Errorf("empty rc: got %v, want nil", got) + } +} + +func TestSynthesizeHints_DeterministicOrdering(t *testing.T) { + rc := &ResourceContext{ + ManagedBy: []ContextRef{{Kind: "Application", Name: "store"}}, + Exposes: []ContextRef{{Kind: "Service", Name: "api"}}, + SelectedBy: []ContextRef{ + {Kind: "NetworkPolicy", Name: "deny"}, + {Kind: "PodDisruptionBudget", Name: "pdb"}, + }, + ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "hpa"}}, + RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, + Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}}, + IssueSummary: &IssueSummary{Count: 2, HighestSeverity: "warning", TopReason: "Backoff"}, + AuditSummary: &AuditSummary{Count: 
3, HighestSeverity: "danger"}, + } + want := []string{ + "Managed by Application store", + "2 issues (warning: Backoff)", + "3 audit findings (danger)", + "Running on node n1", + "Exposed by 1 Service", + "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", + "Scaled by 1 HorizontalPodAutoscaler", + "Uses 1 ConfigMap", + } + got := SynthesizeHints(rc, TierBasic) + if !reflect.DeepEqual(got, want) { + t.Errorf("hints mismatch:\n got: %v\nwant: %v", got, want) + } +} + +func TestSynthesizeHints_BasicTierCapped(t *testing.T) { + // Synthesize a maxed-out context and verify the basic tier caps at + // maxHintsBasic lines. This guards against unbounded hint growth. + rc := &ResourceContext{ + ManagedBy: []ContextRef{{Kind: "App", Name: "a"}}, + Exposes: []ContextRef{{Kind: "Service", Name: "svc"}}, + SelectedBy: []ContextRef{{Kind: "PodDisruptionBudget", Name: "p"}, {Kind: "NetworkPolicy", Name: "n"}}, + ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "h"}}, + RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, + Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}, Secrets: []ContextRef{{Kind: "Secret", Name: "s"}}}, + IssueSummary: &IssueSummary{Count: 1, HighestSeverity: "critical", TopReason: "Crash"}, + AuditSummary: &AuditSummary{Count: 1, HighestSeverity: "danger", TopFinding: "CKV_K8S_1"}, + PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Fail: 1, Warn: 1}}, + } + got := SynthesizeHints(rc, TierBasic) + if len(got) > maxHintsBasic { + t.Errorf("basic tier exceeded cap: got %d hints, want ≤%d (%v)", len(got), maxHintsBasic, got) + } +} + +func TestSynthesizeHints_IssueHint_NoSeverity(t *testing.T) { + rc := &ResourceContext{IssueSummary: &IssueSummary{Count: 1, TopReason: "Pending"}} + got := SynthesizeHints(rc, TierBasic) + want := []string{"1 issue: Pending"} + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v, want %v", got, want) + } +} + +func TestSynthesizeHints_PolicyHint_OnlyPass_Skipped(t *testing.T) 
{ + rc := &ResourceContext{PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Pass: 3}}} + got := SynthesizeHints(rc, TierBasic) + if got != nil { + t.Errorf("only-pass summary should not emit a hint; got %v", got) + } +} + +func TestUsesHint_PVCSingular(t *testing.T) { + rc := &ResourceContext{Uses: &UsesBlock{PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data"}}}} + got := SynthesizeHints(rc, TierBasic) + want := []string{"Uses 1 PVC"} + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v, want %v", got, want) + } +} + +func TestSelectVerb(t *testing.T) { + if selectVerb(1) != "selects this resource" { + t.Errorf("verb(1): %q", selectVerb(1)) + } + if selectVerb(2) != "select this resource" { + t.Errorf("verb(2): %q", selectVerb(2)) + } +} + +func TestSummarizeKindsCounts_AlphabeticalOrder(t *testing.T) { + refs := []ContextRef{ + {Kind: "Service", Name: "a"}, + {Kind: "Ingress", Name: "b"}, + {Kind: "Service", Name: "c"}, + } + got := summarizeKindsCounts(refs) + want := "1 Ingress, 2 Services" + if got != want { + t.Errorf("got %q want %q", got, want) + } +} From 8c9e0b0b2ce9d01d65ac11dd99015de800408e8f Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 10:48:50 +0300 Subject: [PATCH 02/33] fix(ai-handlers): wire PolicyReport index into ResourceContext.Build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught that buildAIResourceContext never set opts.PolicyReports, so policySummary.kyverno stayed absent on the REST get-resource path even after T5 (index) and T11 (composer source). The Build generator already supports it via PolicyReportLookup; the handler just wasn't passing it. Adds a narrow policyReportLookupAdapter that wraps internal/k8s.GetPolicyReportIndex() — translating the richer pkg/policyreports.Finding shape (Severity + Category) into the agent-facing resourcecontext.KyvernoFinding shape (Policy / Rule / Result / Message). 
Keeping the projection narrow at the adapter layer means future additions to policyreports.Finding don't perturb the wire contract. When Kyverno isn't installed, GetPolicyReportIndex() returns nil and opts.PolicyReports stays unset — Build emits no policySummary, which is the correct degraded behavior. --- internal/server/ai_handlers.go | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 4db391ef2..35adde577 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -15,10 +15,42 @@ import ( "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" "github.com/skyhook-io/radar/pkg/resourcecontext" "github.com/skyhook-io/radar/pkg/topology" ) +// policyReportLookupAdapter wraps internal/k8s.GetPolicyReportIndex() into +// the resourcecontext.PolicyReportLookup interface, translating the +// richer pkg/policyreports.Finding shape (which carries Severity + +// Category) into the agent-facing resourcecontext.KyvernoFinding shape +// (Policy / Rule / Result / Message only). Keeping the projection narrow +// here lets unrelated changes to policyreports.Finding evolve without +// perturbing the wire contract that downstream callers depend on. 
+type policyReportLookupAdapter struct { + idx *policyreports.Index +} + +func (a policyReportLookupAdapter) FindingsFor(kind, namespace, name string) []resourcecontext.KyvernoFinding { + if a.idx == nil { + return nil + } + findings := a.idx.FindingsFor(kind, namespace, name) + if len(findings) == 0 { + return nil + } + out := make([]resourcecontext.KyvernoFinding, len(findings)) + for i, f := range findings { + out[i] = resourcecontext.KyvernoFinding{ + Policy: f.Policy, + Rule: f.Rule, + Result: f.Result, + Message: f.Message, + } + } + return out +} + // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aicontext.VerbosityLevel { switch r.URL.Query().Get("verbosity") { @@ -249,6 +281,13 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin AuditSummary: auditSum, } + // Wire the PolicyReport index when Kyverno is installed. Build emits a + // counts-only `policySummary.kyverno` on the basic tier; diagnostic + // tier (T10) will surface the top[] findings. + if idx := k8s.GetPolicyReportIndex(); idx != nil { + opts.PolicyReports = policyReportLookupAdapter{idx: idx} + } + if topo, prov, dyn, ok := s.topologyForContext(namespace); ok { opts.Topology = topo opts.Provider = prov From 2aec8ec38281ecd3e6d94f8234d6f57850ce2279 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 11:06:04 +0300 Subject: [PATCH 03/33] =?UTF-8?q?fix(resourcecontext):=20tier-aware=20Poli?= =?UTF-8?q?cySummary=20=E2=80=94=20basic=20emits=20counts=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught a plan-vs-code mismatch: the locked v1 contract says basic tier emits Kyverno counts only (fail/warn/pass), with Top[] findings reserved for diagnostic tier. 
The code emitted Top[] on both tiers, inflating basic-tier wire size with details that belong on the deeper agent investigation path. Fix: buildPolicySummary now takes ContextTier. Basic emits counts only; diagnostic adds the Top[] (cap 3, ordered fail > warn > error > pass). Existing TestBuild_PolicyReports_RolledUp split into two cases that pin both tier outputs. --- pkg/resourcecontext/build.go | 44 ++++++++++++++++++------------- pkg/resourcecontext/build_test.go | 40 +++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index c3d7374d4..1eaf9437f 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -143,11 +143,13 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte rc.IssueSummary = opts.IssueSummary rc.AuditSummary = opts.AuditSummary - // 5. PolicyReports — Kyverno findings rolled up. + // 5. PolicyReports — Kyverno findings rolled up. Basic tier emits + // counts only (fail/warn/pass); diagnostic tier adds the top[] + // findings. Tier discrimination keeps the basic-tier wire size tight. if opts.PolicyReports != nil { findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) if len(findings) > 0 { - rc.PolicySummary = buildPolicySummary(findings) + rc.PolicySummary = buildPolicySummary(findings, opts.Tier) } } @@ -639,10 +641,14 @@ func checkRef(ctx context.Context, ac RefAccessChecker, r *ContextRef) bool { // buildPolicySummary rolls up Kyverno findings into the summary block. // Top findings are picked first by fail > warn > error > pass, then by -// stable input order — callers can prune to MAX (3 today). +// stable input order — capped at policySummaryTopMax. +// +// Tier discrimination: basic emits counts only (Fail/Warn/Pass) for a +// minimal wire footprint; diagnostic adds the Top[] findings. Locked +// in the plan's v1 contract. 
const policySummaryTopMax = 3 -func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { +func buildPolicySummary(findings []KyvernoFinding, tier ContextTier) *PolicySummary { var fail, warn, pass int for _, f := range findings { switch f.Result { @@ -655,23 +661,25 @@ func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { } } - // Order Top by result priority; cap to policySummaryTopMax. - ordered := append([]KyvernoFinding(nil), findings...) - sort.SliceStable(ordered, func(i, j int) bool { - return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) - }) - if len(ordered) > policySummaryTopMax { - ordered = ordered[:policySummaryTopMax] + ks := &KyvernoSummary{ + Fail: fail, + Warn: warn, + Pass: pass, } - return &PolicySummary{ - Kyverno: &KyvernoSummary{ - Fail: fail, - Warn: warn, - Pass: pass, - Top: ordered, - }, + // Top[] only on diagnostic tier. Basic stays counts-only. + if tier == TierDiagnostic { + ordered := append([]KyvernoFinding(nil), findings...) + sort.SliceStable(ordered, func(i, j int) bool { + return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) + }) + if len(ordered) > policySummaryTopMax { + ordered = ordered[:policySummaryTopMax] + } + ks.Top = ordered } + + return &PolicySummary{Kyverno: ks} } func resultRank(r string) int { diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 503fee9d9..bfdccccf8 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -425,7 +425,9 @@ func TestBuild_HPA_Identity(t *testing.T) { } } -func TestBuild_PolicyReports_RolledUp(t *testing.T) { +func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { + // Basic tier emits counts only (fail/warn/pass). Top[] is reserved + // for diagnostic tier — keeps the basic-tier wire footprint minimal. 
pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} reports := mockPolicyReports{ "Pod/prod/p": { @@ -447,8 +449,8 @@ func TestBuild_PolicyReports_RolledUp(t *testing.T) { if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) } - if len(k.Top) == 0 || k.Top[0].Result != "fail" { - t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + if len(k.Top) != 0 { + t.Errorf("basic tier must NOT emit Top[]; got %d entries: %+v", len(k.Top), k.Top) } gotHint := false for _, h := range rc.Hints { @@ -462,6 +464,38 @@ func TestBuild_PolicyReports_RolledUp(t *testing.T) { } } +func TestBuild_PolicyReports_DiagnosticTierIncludesTop(t *testing.T) { + // Diagnostic tier adds the Top[] findings (capped at 3, ordered + // fail > warn > error > pass). Used by the deep agent investigation + // path — basic tier is for everyday triage. + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierDiagnostic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) == 0 { + t.Fatal("diagnostic tier must emit Top[] findings") + } + if k.Top[0].Result != "fail" { + t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + } +} + func TestBuild_PDB_OutputJSONShape(t *testing.T) { // Pin the wire shape one 
full populated Build produces, so a future // reorder of fields (or accidental omitempty change) is caught. From 045097cea7491795d67ee17c77b477ae1a8f1ea4 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:12:59 +0300 Subject: [PATCH 04/33] refactor(resourcecontext): consume topology.Relationships.ManagedBy + Pod hygiene fields (T23 dedup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit T23 (#720) made topology.Relationships the canonical projection for server-synthesized ManagedBy (owner-chain + GitOps detection) and Pod hygiene fields (ServiceAccount, Node). Drop the parallel owner-walk and label/annotation scanning inside resourcecontext.Build — read Relationships.{ManagedBy,ServiceAccount,Node} instead. Build now calls topology.GetRelationshipsWithObject when Topology is set, passing the fetched obj so kind/group disambiguation works (Knative serving.knative.dev/Service vs core/v1 Service). Single- resource callers (REST GET /api/ai/resources) pass idx=nil — the per- call O(E) scan is fine for one walk per request; bulk callers should pre-build via topology.IndexByResource(topo) and pass through Options.RelIndex or Options.Relationships. No deprecation aliases: buildManagedBy, parseArgoTrackingID, pickControllerOwner, groupFromAPIVersion, readPair, and the gitops/flux label constants are deleted outright (per saved feedback preference). Their tests are removed — the same logic is exercised in pkg/topology/managedby_test.go. 
--- internal/server/ai_handlers.go | 8 + pkg/resourcecontext/build.go | 290 ++++++++++++------------------ pkg/resourcecontext/build_test.go | 112 ++++-------- 3 files changed, 154 insertions(+), 256 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 35adde577..3b27b9021 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -292,6 +292,14 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts.Topology = topo opts.Provider = prov opts.DynamicProv = dyn + // Pre-compute Relationships once with the already-fetched obj so + // kind/group disambiguation works (Knative serving.knative.dev/Service + // vs core/v1 Service). idx=nil is fine for single-resource: the + // per-call inline scan is O(E) once. Bulk callers (T12/T89) should + // build a shared index via topology.IndexByResource(topo). + opts.Relationships = topology.GetRelationshipsWithObject( + kind, namespace, name, obj, topo, prov, dyn, nil, + ) } return resourcecontext.Build(r.Context(), obj, opts) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 1eaf9437f..ea69d041f 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -3,7 +3,6 @@ package resourcecontext import ( "context" "sort" - "strings" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -34,11 +33,28 @@ type Options struct { AccessChecker RefAccessChecker // Topology data sources. When Topology is nil, the topology-derived - // fields (Exposes, SelectedBy, ScaledBy) are skipped. + // fields (Exposes, SelectedBy, ScaledBy) are skipped, while ManagedBy and + // RunsOn fall back to the object's own metadata and pod.Spec.NodeName. Topology *topology.Topology Provider topology.ResourceProvider DynamicProv topology.DynamicProvider + // Relationships is the pre-computed per-resource projection. 
When non-nil, + // Build consumes it directly instead of calling + // topology.GetRelationshipsWithObject — single-resource handlers should + // leave this nil and let Build compute; bulk/list callers that already + // loop over relationships per row SHOULD pass it to avoid double work. + // + // Topology MUST still be set when Relationships is set — synthesis + // helpers (e.g. ManagedBy owner walk) read Topology and RelIndex through + // it. + Relationships *topology.Relationships + + // RelIndex is the topology inverted-edge index. Pass a shared instance + // (topology.IndexByResource(topo)) for high-fanout callers; nil is fine + // for single-resource Build paths — the per-call inline scan is O(E) once. + RelIndex *topology.RelationshipsIndex + // Pre-computed summaries — pass-through into the response. IssueSummary *IssueSummary AuditSummary *AuditSummary @@ -86,18 +102,47 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte rc := &ResourceContext{Tier: opts.Tier} omitted := newOmittedTracker() - // 1. ManagedBy — owner chain + GitOps labels/annotations - rc.ManagedBy = filterRefs( - ctx, opts.AccessChecker, - buildManagedBy(ident), - "managedBy", omitted, - ) + // Topology-derived relationships drive ManagedBy / Exposes / SelectedBy / + // ScaledBy / RunsOn / Uses.ServiceAccount. T23 made + // topology.Relationships the canonical projection: server-side + // SynthesizeManagedBy walks the owner chain + GitOps signals, and the Pod + // hygiene fields (.ServiceAccount, .Node) are populated from pod.Spec. + // We do NOT re-walk owner refs here — that would duplicate the topology + // package's logic and risk drift. + // + // Single-resource callers (REST GET, MCP get_resource) leave + // opts.Relationships nil and let us compute via GetRelationshipsWithObject + // — passing obj keeps kind/group disambiguation correct for CRDs whose + // plural collides with a core resource. 
Bulk callers that already loop + // over relationships per row pass them in directly. + rel := opts.Relationships + if rel == nil && opts.Topology != nil { + rel = topology.GetRelationshipsWithObject( + ident.Kind, ident.Namespace, ident.Name, obj, + opts.Topology, opts.Provider, opts.DynamicProv, opts.RelIndex, + ) + } + + // 1. ManagedBy — prefer Relationships.ManagedBy (server-synthesized when + // a topology is available; covers GitOps signals + owner-chain walk). + // Fall back to topology.SynthesizeManagedBy with the obj alone when no + // topology is provided — that path still detects Argo/Flux/Helm signals + // from labels and annotations without needing a graph. + var managedBy []topology.ResourceRef + if rel != nil && len(rel.ManagedBy) > 0 { + managedBy = rel.ManagedBy + } else if rel == nil { + if m, ok := obj.(metav1.Object); ok { + managedBy = topology.SynthesizeManagedBy(m, ident.Kind, ident.Namespace, ident.Name, nil, nil, nil) + } + } + if len(managedBy) > 0 { + rc.ManagedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(managedBy, ReasonOwnerReference, SourceOwnerChain), + "managedBy", omitted) + } // 2. Topology-derived: Exposes, SelectedBy, ScaledBy - var rel *topology.Relationships - if opts.Topology != nil { - rel = topology.GetRelationships(ident.Kind, ident.Namespace, ident.Name, opts.Topology, opts.Provider, opts.DynamicProv) - } if rel != nil { exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) exposes = append(exposes, rel.Services...) @@ -120,14 +165,49 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte "scaledBy", omitted) } - // 3. Pod-specific: Uses + RunsOn + // 3. Pod-specific: RunsOn (Node) + Uses (ConfigMap/Secret/PVC/SA). + // + // RunsOn and Uses.ServiceAccount come from topology.Relationships when + // available (T23 populates them from pod.Spec server-side). 
We still + // scan pod.Spec.Volumes / .EnvFrom directly for the ConfigMap/Secret/PVC + // inventory — topology doesn't model those use-edges at the granularity + // Build needs. if pod, ok := obj.(*corev1.Pod); ok { rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) - if pod.Spec.NodeName != "" { + // Prefer rel.ServiceAccount over re-reading pod.Spec — same source, + // but consolidating through Relationships keeps Build aligned with + // how MCP/agents consume the field. + if rc.Uses != nil && rc.Uses.ServiceAccount == nil && rel != nil && rel.ServiceAccount != nil { + candidate := &ContextRef{ + Kind: rel.ServiceAccount.Kind, + Group: rel.ServiceAccount.Group, + Namespace: rel.ServiceAccount.Namespace, + Name: rel.ServiceAccount.Name, + Reason: ReasonSAName, + Source: SourceK8sSpec, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.Uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + // RunsOn: prefer the topology-supplied Node ref; fall back to + // pod.Spec.NodeName only when topology is absent (no rel). + var nodeName, nodeGroup string + if rel != nil && rel.Node != nil { + nodeName = rel.Node.Name + nodeGroup = rel.Node.Group + } else if rel == nil { + nodeName = pod.Spec.NodeName + } + if nodeName != "" { candidate := &ContextRef{ Kind: "Node", - Name: pod.Spec.NodeName, + Group: nodeGroup, + Name: nodeName, Reason: ReasonNodeName, Source: SourceK8sSpec, } @@ -167,16 +247,14 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // --------------------------------------------------------------------------- // resourceIdentity is the projection of obj that Build needs without holding -// on to the full runtime.Object. Owner refs and labels feed ManagedBy; the -// (Kind, Namespace, Name) tuple keys topology + summary lookups. +// on to the full runtime.Object. 
The (Kind, Namespace, Name) tuple keys +// topology relationship lookups and summary lookups; Group is retained for +// future use by callers inspecting the identity directly. type resourceIdentity struct { - Kind string - Group string - Namespace string - Name string - Labels map[string]string - Annotations map[string]string - Owners []metav1.OwnerReference + Kind string + Group string + Namespace string + Name string } // identityOf extracts identity from a typed K8s object or unstructured. @@ -241,13 +319,10 @@ func identityOf(obj runtime.Object) (resourceIdentity, bool) { case *unstructured.Unstructured: gvk := v.GroupVersionKind() return resourceIdentity{ - Kind: gvk.Kind, - Group: gvk.Group, - Namespace: v.GetNamespace(), - Name: v.GetName(), - Labels: v.GetLabels(), - Annotations: v.GetAnnotations(), - Owners: v.GetOwnerReferences(), + Kind: gvk.Kind, + Group: gvk.Group, + Namespace: v.GetNamespace(), + Name: v.GetName(), }, true } return resourceIdentity{}, false @@ -255,156 +330,11 @@ func identityOf(obj runtime.Object) (resourceIdentity, bool) { func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { return resourceIdentity{ - Kind: kind, - Group: group, - Namespace: m.Namespace, - Name: m.Name, - Labels: m.Labels, - Annotations: m.Annotations, - Owners: m.OwnerReferences, - } -} - -// --------------------------------------------------------------------------- -// ManagedBy detection -// --------------------------------------------------------------------------- - -// GitOps label/annotation keys — kept in sync with packages/k8s-ui/src/utils/gitops-owner.ts. 
-const ( - argoTrackingIDAnnotation = "argocd.argoproj.io/tracking-id" - argoInstanceLabel = "argocd.argoproj.io/instance" - fluxKustomizeNameLabel = "kustomize.toolkit.fluxcd.io/name" - fluxKustomizeNSLabel = "kustomize.toolkit.fluxcd.io/namespace" - fluxHelmNameLabel = "helm.toolkit.fluxcd.io/name" - fluxHelmNSLabel = "helm.toolkit.fluxcd.io/namespace" -) - -// buildManagedBy returns the ContextRefs describing what manages this -// resource. Precedence (most-specific wins): -// 1. Flux HelmRelease labels -// 2. Flux Kustomization labels -// 3. Argo tracking-id annotation -// 4. Argo instance label -// 5. First owner reference (controller=true preferred) -// -// Only one path emits today — the field is a slice so future taxonomies -// (e.g. dual ArgoCD + Flux) can list multiple managers without a wire change. -func buildManagedBy(ident resourceIdentity) []ContextRef { - if name, ns, ok := readPair(ident.Labels, fluxHelmNameLabel, fluxHelmNSLabel); ok { - return []ContextRef{{ - Kind: "HelmRelease", - Group: "helm.toolkit.fluxcd.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - if name, ns, ok := readPair(ident.Labels, fluxKustomizeNameLabel, fluxKustomizeNSLabel); ok { - return []ContextRef{{ - Kind: "Kustomization", - Group: "kustomize.toolkit.fluxcd.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - if id := ident.Annotations[argoTrackingIDAnnotation]; id != "" { - if ns, name, ok := parseArgoTrackingID(id); ok && name != "" { - return []ContextRef{{ - Kind: "Application", - Group: "argoproj.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - } - if inst := ident.Labels[argoInstanceLabel]; inst != "" { - // App namespace unknown without tracking-id — emit with empty ns - // like the UI does; the consumer decides whether to navigate. 
- return []ContextRef{{ - Kind: "Application", - Group: "argoproj.io", - Name: inst, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - - if owner := pickControllerOwner(ident.Owners); owner != nil { - group := groupFromAPIVersion(owner.APIVersion) - return []ContextRef{{ - Kind: owner.Kind, - Group: group, - Namespace: ident.Namespace, - Name: owner.Name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - return nil -} - -func readPair(m map[string]string, k1, k2 string) (string, string, bool) { - a := m[k1] - b := m[k2] - if a == "" || b == "" { - return "", "", false - } - return a, b, true -} - -// parseArgoTrackingID mirrors gitops-owner.ts. Two forms: -// -// ":..." (legacy, single name) -// "_:..." (namespaced install) -// -// Returns (ns, name, ok). -func parseArgoTrackingID(value string) (string, string, bool) { - colon := strings.IndexByte(value, ':') - if colon < 0 { - return "", "", false - } - head := value[:colon] - if head == "" { - return "", "", false - } - if sep := strings.IndexByte(head, '_'); sep >= 0 { - ns := head[:sep] - name := head[sep+1:] - if name == "" { - return "", "", false - } - return ns, name, true - } - return "", head, true -} - -// pickControllerOwner returns the first owner with Controller=true; falls -// back to the first owner if none are marked controller. Returns nil when -// the slice is empty. -func pickControllerOwner(owners []metav1.OwnerReference) *metav1.OwnerReference { - for i := range owners { - if owners[i].Controller != nil && *owners[i].Controller { - return &owners[i] - } - } - if len(owners) > 0 { - return &owners[0] - } - return nil -} - -// groupFromAPIVersion extracts the group from "group/version" or "version" -// (core/v1 form). Mirrors schema.ParseGroupVersion without the import. 
-func groupFromAPIVersion(apiVersion string) string { - if i := strings.IndexByte(apiVersion, '/'); i >= 0 { - return apiVersion[:i] + Kind: kind, + Group: group, + Namespace: m.Namespace, + Name: m.Name, } - return "" } // --------------------------------------------------------------------------- diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index bfdccccf8..3a71b81ec 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -59,7 +59,7 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { "app.kubernetes.io/name": "web", }, Annotations: map[string]string{ - argoTrackingIDAnnotation: "argocd_storefront:apps/Deployment:prod/web", + "argocd.argoproj.io/tracking-id": "argocd_storefront:apps/Deployment:prod/web", }, OwnerReferences: []metav1.OwnerReference{ {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "web-7d", Controller: ptrBool(true)}, @@ -234,8 +234,8 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { Name: "web", Namespace: "prod", Labels: map[string]string{ - fluxHelmNameLabel: "web", - fluxHelmNSLabel: "flux-system", + "helm.toolkit.fluxcd.io/name": "web", + "helm.toolkit.fluxcd.io/namespace": "flux-system", }, }, } @@ -328,8 +328,10 @@ func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { } func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { - // A ConfigMap with a controller owner reference. No topology, no Pod - // spec — just owner-chain ManagedBy. + // A ConfigMap owned by a Deployment via EdgeManages — owner-chain + // ManagedBy is sourced from topology.SynthesizeManagedBy walking the + // owner graph (T23 canonical projection). No Pod spec, no GitOps + // labels — just the topology owner edge. 
cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "web-config", @@ -339,16 +341,26 @@ func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { }, }, } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "configmap/prod/web-config", Kind: topology.KindConfigMap, Name: "web-config"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "deployment/prod/web", Target: "configmap/prod/web-config", Type: topology.EdgeManages}, + }, + } rc := Build(context.Background(), cm, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, + Topology: topo, EmitHints: true, }) if got, want := len(rc.ManagedBy), 1; got != want { t.Fatalf("ManagedBy len: got %d want %d", got, want) } mb := rc.ManagedBy[0] - if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" || mb.Group != "apps" { + if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" { t.Errorf("ManagedBy[0]: got %+v", mb) } } @@ -388,9 +400,14 @@ func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { } func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { + // Flux Helm labels — detected from obj metadata directly via + // topology.SynthesizeManagedBy without needing a populated Topology. dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", - OwnerReferences: []metav1.OwnerReference{{Kind: "Foo", APIVersion: "ex.io/v1", Name: "f", Controller: ptrBool(true)}}}, + Labels: map[string]string{ + "helm.toolkit.fluxcd.io/name": "web", + "helm.toolkit.fluxcd.io/namespace": "flux-system", + }}, } rc := Build(context.Background(), dep, Options{ Tier: TierBasic, @@ -508,9 +525,21 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { }, Spec: corev1.PodSpec{NodeName: "n1"}, } + // Topology with the owner edge so SynthesizeManagedBy can walk the + // chain and emit a ReplicaSet ManagedBy ref for wire-shape coverage. 
+ topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "pod/prod/p", Kind: topology.KindPod, Name: "p"}, + {ID: "replicaset/prod/rs", Kind: topology.KindReplicaSet, Name: "rs"}, + }, + Edges: []topology.Edge{ + {Source: "replicaset/prod/rs", Target: "pod/prod/p", Type: topology.EdgeManages}, + }, + } rc := Build(context.Background(), pod, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, + Topology: topo, EmitHints: true, }) b, err := json.MarshalIndent(rc, "", " ") @@ -530,75 +559,6 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { } } -// --------------------------------------------------------------------------- -// Sub-helpers' unit coverage -// --------------------------------------------------------------------------- - -func TestParseArgoTrackingID(t *testing.T) { - cases := []struct { - in string - wantNS string - wantName string - wantOK bool - shortName string - }{ - {"argocd_store:apps/Deployment:prod/web", "argocd", "store", true, "namespaced form"}, - {"store:apps/Deployment:prod/web", "", "store", true, "legacy form"}, - {"", "", "", false, "empty"}, - {":foo/bar", "", "", false, "missing head"}, - {"a_:foo", "", "", false, "missing name"}, - } - for _, c := range cases { - t.Run(c.shortName, func(t *testing.T) { - ns, name, ok := parseArgoTrackingID(c.in) - if ns != c.wantNS || name != c.wantName || ok != c.wantOK { - t.Errorf("parseArgoTrackingID(%q) = (%q, %q, %v) want (%q, %q, %v)", - c.in, ns, name, ok, c.wantNS, c.wantName, c.wantOK) - } - }) - } -} - -func TestGroupFromAPIVersion(t *testing.T) { - cases := map[string]string{ - "v1": "", - "apps/v1": "apps", - "argoproj.io/v1alpha1": "argoproj.io", - "networking.k8s.io/v1": "networking.k8s.io", - "helm.toolkit.fluxcd.io/v2beta1": "helm.toolkit.fluxcd.io", - } - for in, want := range cases { - if got := groupFromAPIVersion(in); got != want { - t.Errorf("groupFromAPIVersion(%q) = %q, want %q", in, got, want) - } - } -} - -func TestPickControllerOwner_PrefersController(t 
*testing.T) { - owners := []metav1.OwnerReference{ - {Kind: "Other", Name: "x"}, - {Kind: "Boss", Name: "ctrl", Controller: ptrBool(true)}, - } - got := pickControllerOwner(owners) - if got == nil || got.Name != "ctrl" { - t.Errorf("got %+v, want ctrl", got) - } -} - -func TestPickControllerOwner_FallsBackToFirst(t *testing.T) { - owners := []metav1.OwnerReference{ - {Kind: "Solo", Name: "first"}, - {Kind: "Other", Name: "x"}, - } - got := pickControllerOwner(owners) - if got == nil || got.Name != "first" { - t.Errorf("got %+v, want first", got) - } - if got := pickControllerOwner(nil); got != nil { - t.Errorf("nil owners should return nil, got %+v", got) - } -} - // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- From 99c878c5c135fb5b5db16856c7f56177740ab112 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:30:53 +0300 Subject: [PATCH 05/33] fix(security): RBAC preflight on /api/ai/resources GET handler handleAIGetResource skipped the user-RBAC gates that handleGetResource runs, so a user with namespace access but no per-namespace get-secrets SAR could read Secret values via the AI endpoint, and a user without cluster-scoped node SAR could read Node objects. The AI surface returns the same resource bytes (just minified + wrapped) as the REST surface, so it must enforce the same gates. Extract the gate block into Server.preflightResourceGet and call it from both handlers. Single helper keeps the two endpoints in lockstep so future RBAC adjustments touch one place. Tests cover the three deny arms (per-ns get-secrets, cluster-scoped get-node, namespace access) plus a passing-case sanity check on the AI envelope shape. 
--- internal/server/ai_handlers.go | 11 +++ internal/server/ai_handlers_rbac_test.go | 101 +++++++++++++++++++++++ internal/server/server.go | 82 +++++++++++------- 3 files changed, 163 insertions(+), 31 deletions(-) create mode 100644 internal/server/ai_handlers_rbac_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 3b27b9021..e8eff3284 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -182,6 +182,17 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { namespace = "" } + // Run the same RBAC preflight as handleGetResource — the AI endpoint + // returns the same resource bytes (just minified) and must gate on the + // same per-user SAR / namespace-access tuple. Without this, a user with + // no `get secrets` SAR could read Secret values via /api/ai/resources/… + // even though /api/resources/… correctly returns 403. Runs BEFORE the + // fetch so cluster-scoped denies don't leak existence by status code. + if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go new file mode 100644 index 000000000..0279bf243 --- /dev/null +++ b/internal/server/ai_handlers_rbac_test.go @@ -0,0 +1,101 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + + "github.com/skyhook-io/radar/internal/auth" +) + +// RBAC preflight on /api/ai/resources/{kind}/{namespace}/{name}. +// +// The AI single-resource GET returns the same resource bytes (just minified +// + wrapped in a resourceContext block) as /api/resources/{kind}/{ns}/{name}. 
+// It must therefore enforce the same per-user RBAC gates that +// handleGetResource enforces — otherwise a user could read Secret values via +// the AI surface even when the REST surface correctly returns 403. +// +// Both handlers call s.preflightResourceGet, so these tests pin the AI +// endpoint's gates (and a regression that bypasses the helper on the AI side +// would surface here even if the REST tests still pass). + +func TestProxyAuth_AIGetSecret_PerNamespaceRBAC_Denied(t *testing.T) { + // alice has namespace access to "default" but the per-namespace + // canRead("","secrets","default","get") returns false. The cache holds + // nginx-tls (seeded as the SA which has cluster-wide secrets RBAC), + // so without the preflight a 200 would leak secret bytes. + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretGetCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secret/default/nginx-tls", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-secret without per-ns get SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetNode_ClusterScopedRBAC_Denied(t *testing.T) { + // Node is cluster-scoped — the AI GET must require per-kind get-node SAR. + // AllowedNamespaces==nil (cluster-wide-namespace sentinel) is NOT a + // license to read cluster-scoped kinds: that's the exact conflation the + // preflight helper guards against. A regression that dropped the + // ClassifyKindScope arm would let nodes through here. 
+ env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("get", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/node/_/worker-1", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-node without cluster-scoped get-node SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceDenied(t *testing.T) { + // alice has namespace access only to "default" — a get against a pod + // in "kube-system" must 403 BEFORE any fetch, matching handleGetResource. + // A regression that fetched first and then filtered would let timing + // signal whether the pod exists (oracle). + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/kube-system/some-pod", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-pod in disallowed namespace, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceAllowed(t *testing.T) { + // Sanity check: a user with namespace access AND who hits an existing + // resource gets a 200 with the {resource, resourceContext} envelope. + // Pins that the preflight isn't accidentally over-gating happy-path + // requests (e.g., a misordered check that always denies). 
+ env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/default/nginx-abc-xyz", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200 on allowed AI get-pod, got %d", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + if _, ok := body["resource"]; !ok { + t.Errorf("expected 'resource' field in AI get response, got: %+v", body) + } + if _, ok := body["resourceContext"]; !ok { + t.Errorf("expected 'resourceContext' field in AI get response, got: %+v", body) + } +} diff --git a/internal/server/server.go b/internal/server/server.go index b4e350187..23e81e941 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1470,29 +1470,23 @@ func setTypeMeta(resource any) { k8s.SetTypeMeta(resource) } -func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - namespace := chi.URLParam(r, "namespace") - name := chi.URLParam(r, "name") - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace - if namespace == "_" { - namespace = "" - } - - // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are - // gated per-kind via SAR. Run BEFORE the namespace access check so - // users with explicit cluster-scoped RBAC but no namespace access can - // still get the resource. ClassifyKindScope catches both static cluster- - // only kinds and dynamic cluster-scoped CRDs (via discovery). 
- // - // "namespaces" is cluster-scoped at the K8s API but exposed as a per-user - // filtered list — gate the GET via the user's namespace access for the - // requested name, not via cluster-scoped SAR. +// preflightResourceGet runs the per-user RBAC gates that must pass before any +// single-resource GET fetch. Mirrors the kind/scope-aware logic used by both +// the REST handler (handleGetResource) and the AI handler (handleAIGetResource) +// so future RBAC adjustments stay in lockstep across both surfaces. +// +// Inputs are the already-normalized (kind, namespace, name, group); callers +// must collapse the cluster-scoped "_" placeholder before calling. Returns +// (0, "", ok=true) when the request passes the gates, or +// (status, message, ok=false) with the HTTP status + body the caller should +// emit on deny. +// +// Three gates, run in this order: +// 1. kind == "namespaces" → full Namespace object requires get-namespaces SAR +// 2. cluster-scoped (Node/CRD/…) → per-kind get SAR (ClassifyKindScope) +// 3. namespaced → namespace access via getUserNamespaces, +// plus per-namespace get SAR for Secrets +func (s *Server) preflightResourceGet(r *http.Request, kind, namespace, name, group string) (int, string, bool) { isNamespacesKind := kind == "namespaces" || kind == "namespace" isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) switch { @@ -1502,30 +1496,56 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { // imply read access to the Namespace object itself. Restricted users // without ClusterRole on namespaces get 403 here. 
if !s.canRead(r, "", "namespaces", "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name), false } case isClusterScoped: if !s.canRead(r, gvrGroup, gvrResource, "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind)) - return + return http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind), false } case namespace != "": // Namespaced kind: verify namespace access. allowed := s.getUserNamespaces(r, []string{namespace}) if noNamespaceAccess(allowed) { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace), false } // Per-kind RBAC inside the namespace for Secrets — the chart can // grant the SA cluster-wide secrets (Helm release visibility), so // namespace-list discovery is not a sufficient gate here. The list // handler has the matching list-SAR. 
if (kind == "secrets" || kind == "secret") && !s.canRead(r, "", "secrets", namespace, "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace), false } } + return 0, "", true +} + +func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace + if namespace == "_" { + namespace = "" + } + + // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are + // gated per-kind via SAR. Run BEFORE the namespace access check so + // users with explicit cluster-scoped RBAC but no namespace access can + // still get the resource. ClassifyKindScope catches both static cluster- + // only kinds and dynamic cluster-scoped CRDs (via discovery). + // + // "namespaces" is cluster-scoped at the K8s API but exposed as a per-user + // filtered list — the GET of a full Namespace object is still gated via a + // get-namespaces SAR (see preflightResourceGet), not via that filtered list. 
+ if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } cache := k8s.GetResourceCache() if cache == nil { From d0a91c7ee58534c91e2754c37299ab93b00726ae Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:03:40 +0300 Subject: [PATCH 06/33] fix(resourcecontext): audit cross-Kind contamination + RunsOn fallback + selectedBy mislabel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer's critical (#721): computeAuditSummaryForResource dropped the Kind from its signature and matched only (namespace, name) by iterating the audit index. A Deployment "web" in "prod" silently inherited audit findings from a Service or ConfigMap of the same name in the same namespace. The loop's map-iteration order also made TopFinding non-deterministic across runs for tied severities — breaking the determinism guarantee SynthesizeHints downstream advertises. Fix derives the canonical kind from obj's TypeMeta in buildAIResourceContext (Pascal singular — exactly what audit's check runner writes into Finding.Kind) and threads it into the audit lookup. The lookup is now a single map access via bpaudit.ResourceKey instead of an O(n) scan. TopFinding selection sorts by (severity desc, CheckID asc) so ties resolve identically every run. Important fixes also addressed: - RunsOn fallback gap (pkg/resourcecontext/build.go:197): the `else if rel == nil` guard meant pod.Spec.NodeName was used only when there was no topology at all. When topology was present but rel.Node was nil (Node informer cold, node not yet indexed), RunsOn stayed empty even though the Pod spec named a node. Now falls back any time rel.Node is missing. - selectedByHint mislabel (pkg/resourcecontext/hints.go:124): every non-PDB ref was rendered as NetworkPolicy, including any future kind added to SelectedBy. 
Now explicit-match each known kind (PodDisruptionBudget, NetworkPolicy) and drop unrecognized kinds through summarizeKindsCounts instead of mislabeling. --- internal/server/ai_handlers.go | 58 +++++++++++++++++++++------------- pkg/resourcecontext/build.go | 11 +++++-- pkg/resourcecontext/hints.go | 21 ++++++++---- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index e8eff3284..a6afc6231 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net/http" + "sort" "strings" "github.com/go-chi/chi/v5" @@ -281,8 +282,18 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin } cache := k8s.GetResourceCache() + // Canonical kind from the resource's own TypeMeta (set at fetch). Pascal + // singular — matches what the audit check runner writes into Finding.Kind, + // so the audit index lookup keys correctly. Falls back to the URL kind + // only when TypeMeta is somehow empty; non-canonical input there would + // silently mis-key the audit lookup. + canonicalKind := obj.GetObjectKind().GroupVersionKind().Kind + if canonicalKind == "" { + canonicalKind = kind + } + issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) - auditSum := computeAuditSummaryForResource(cache, namespace, name) + auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, @@ -411,11 +422,19 @@ func composeSeverityRank(s issues.Severity) int { } // computeAuditSummaryForResource looks up audit findings for the subject -// resource. Uses pkg/audit.IndexByResource so the lookup is keyed on the -// canonical (Kind/ns/name) tuple — handles plural→singular normalization -// via the Finding.Kind values written by the check runner. 
-func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name string) *resourcecontext.AuditSummary { - if cache == nil { +// resource via the canonical (Kind/ns/name) tuple. kind MUST be the Pascal +// singular form the audit check runner writes into Finding.Kind (e.g. "Pod", +// not "pod" or "pods") — the caller derives it from obj's TypeMeta. Without +// a Kind-aware key, a Deployment "web" in "prod" would inherit findings +// from a Service "web" in the same namespace, since map iteration in the +// previous implementation only compared (namespace, name). +// +// TopFinding is selected deterministically: highest severity wins, with +// CheckID as the ascending tiebreaker. Map iteration ordering does NOT +// influence the choice — relevant because SynthesizeHints downstream +// advertises deterministic output. +func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil || kind == "" { return nil } results := audit.RunFromCache(cache, []string{namespace}, nil) @@ -423,27 +442,22 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name st return nil } idx := bpaudit.IndexByResource(results.Findings) - var match []bpaudit.Finding - for key, fs := range idx { - parts := strings.SplitN(key, "/", 3) - if len(parts) != 3 { - continue - } - if parts[1] == namespace && parts[2] == name { - match = append(match, fs...) - } - } + match := idx[bpaudit.ResourceKey(kind, namespace, name)] if len(match) == 0 { return nil } - var topSeverity, topFinding string - for _, f := range match { - if topSeverity == "" || auditSeverityRank(f.Severity) > auditSeverityRank(topSeverity) { - topSeverity = f.Severity - topFinding = f.CheckID + // Sort by (severity desc, CheckID asc) so TopFinding is deterministic + // across runs even when multiple findings tie on severity. 
+ sort.Slice(match, func(i, j int) bool { + ri, rj := auditSeverityRank(match[i].Severity), auditSeverityRank(match[j].Severity) + if ri != rj { + return ri > rj } - } + return match[i].CheckID < match[j].CheckID + }) + topSeverity := match[0].Severity + topFinding := match[0].CheckID return &resourcecontext.AuditSummary{ Count: len(match), HighestSeverity: topSeverity, diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index ea69d041f..e8ef0f93a 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -194,13 +194,18 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } } - // RunsOn: prefer the topology-supplied Node ref; fall back to - // pod.Spec.NodeName only when topology is absent (no rel). + // RunsOn: prefer the topology-supplied Node ref. Fall back to + // pod.Spec.NodeName any time rel.Node is empty — the Node informer + // may be cold, the node may not yet be in the topology graph, or + // rel itself may be nil. The previous `else if rel == nil` guard + // dropped the fallback when topology was built but rel.Node hadn't + // been populated yet, leaving RunsOn empty even though the Pod + // spec clearly named a node. var nodeName, nodeGroup string if rel != nil && rel.Node != nil { nodeName = rel.Node.Name nodeGroup = rel.Node.Group - } else if rel == nil { + } else { nodeName = pod.Spec.NodeName } if nodeName != "" { diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go index 30ccd698b..a385f455f 100644 --- a/pkg/resourcecontext/hints.go +++ b/pkg/resourcecontext/hints.go @@ -125,23 +125,32 @@ func selectedByHint(refs []ContextRef) string { if len(refs) == 0 { return "" } - // Distinguish PDB vs NetworkPolicy in the hint — they read very - // differently to a human, and lumping them together loses signal. 
- var pdb, np []ContextRef + // Distinguish known SelectedBy kinds (PDB vs NetworkPolicy) in the hint — + // they read very differently to a human, and lumping them together loses + // signal. Match each kind explicitly: a future kind added to SelectedBy + // (e.g. ValidatingAdmissionPolicy) would otherwise be silently rendered + // as NetworkPolicy. Unrecognized kinds drop through to summarizeKindsCounts. + var pdb, np, other []ContextRef for _, r := range refs { - if r.Kind == "PodDisruptionBudget" { + switch r.Kind { + case "PodDisruptionBudget": pdb = append(pdb, r) - } else { + case "NetworkPolicy": np = append(np, r) + default: + other = append(other, r) } } - parts := make([]string, 0, 2) + parts := make([]string, 0, 3) if n := len(np); n > 0 { parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) } if n := len(pdb); n > 0 { parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) } + if len(other) > 0 { + parts = append(parts, summarizeKindsCounts(other)) + } return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) } From 74ff1d561caf202b673390bbe39ebf7b99686506 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:45:25 +0300 Subject: [PATCH 07/33] fix(security): route group-qualified AI GET to dynamic cache to avoid kind collisions fetchAIResource was calling FetchResource (typed cache) before consulting the group qualifier. For /api/ai/resources/services/ns/name?group= serving.knative.dev, the typed cache returns the core/v1 Service and the ?group= is silently dropped, leaking the wrong object via the AI surface. Branch on group != "" first and route directly to GetDynamicWithGroup, mirroring handleGetResource's dispatch in server.go. The typed-first path remains correct (and faster) when no group is passed. Same bug class as T12's group-blind root lookup, on the single-resource GET path. 
--- internal/k8s/testing.go | 52 ++++++++ internal/server/ai_handlers.go | 16 +++ internal/server/ai_handlers_group_test.go | 149 ++++++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 internal/server/ai_handlers_group_test.go diff --git a/internal/k8s/testing.go b/internal/k8s/testing.go index 799b2811e..560cbf632 100644 --- a/internal/k8s/testing.go +++ b/internal/k8s/testing.go @@ -4,7 +4,9 @@ import ( "sync" "github.com/skyhook-io/radar/pkg/k8score" + "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" + fakeclientset "k8s.io/client-go/kubernetes/fake" ) // InitTestResourceCache creates a resource cache from a fake or test client, @@ -68,6 +70,56 @@ func InitTestResourceCache(client kubernetes.Interface) error { return nil } +// InitTestDynamicResourceCache wires the dynamic resource cache and discovery +// singletons against test fakes. Pass a dynamic client (typically from +// dynamicfake.NewSimpleDynamicClientWithCustomListKinds) and the set of +// APIResources to register in discovery. Each registered resource gets a GVR +// entry that group-qualified lookups (GetGVRWithGroup) and dynamic informers +// can resolve. +// +// Callers should defer ResetTestDynamicState — without it, the dynamic +// singletons leak into other tests that share TestMain state. +// +// This is intended for integration tests only. +func InitTestDynamicResourceCache(dynClient dynamic.Interface, resources []APIResource) error { + clientMu.Lock() + dynamicClient = dynClient + clientMu.Unlock() + + // Bootstrap discovery from a fake clientset so NewResourceDiscovery has a + // non-nil discovery client; AddAPIResource then registers the test-only + // GVRs (e.g. serving.knative.dev/Service) the test depends on. 
+ fakeDisc := fakeclientset.NewSimpleClientset().Discovery() + core, err := k8score.NewResourceDiscovery(fakeDisc) + if err != nil { + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() + return err + } + for _, r := range resources { + core.AddAPIResource(r) + } + + discoveryMu.Lock() + resourceDiscovery = &ResourceDiscovery{ResourceDiscovery: core} + discoveryOnce = new(sync.Once) + discoveryOnce.Do(func() {}) + discoveryMu.Unlock() + + return InitDynamicResourceCache(nil) +} + +// ResetTestDynamicState tears down the dynamic cache + discovery singletons +// and clears the dynamic client. Pairs with InitTestDynamicResourceCache. +func ResetTestDynamicState() { + ResetDynamicResourceCache() + ResetResourceDiscovery() + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() +} + // SetTestContextName is a test-only helper that overrides the package-level // kubeconfig context name. Used by tests that exercise per-context state // (e.g. namespace preferences) without needing to spin up a real client. diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index a6afc6231..6ebab36f3 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -230,7 +230,23 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { // fetchAIResource resolves the resource from the typed cache or dynamic cache. // The bool reports whether the returned object is an unstructured (CRD) value. +// +// When a group is provided, the typed cache is skipped entirely and the +// dynamic cache is consulted with the group qualifier. This prevents kind +// collisions where a CRD plural shadows a core kind (e.g., Knative +// serving.knative.dev/Service vs core/v1 Service): without this branch, +// FetchResource("services", ...) would return the core Service from the +// typed informer and the requested group would never be consulted, leaking +// the wrong object via the AI surface. 
Mirrors handleGetResource's +// group-first dispatch in server.go. func (s *Server) fetchAIResource(ctx context.Context, cache *k8s.ResourceCache, kind, namespace, name, group string) (runtime.Object, bool, error) { + if group != "" { + u, err := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if err != nil { + return nil, false, err + } + return u, true, nil + } obj, err := k8s.FetchResource(cache, kind, namespace, name) if err == nil { return obj, false, nil diff --git a/internal/server/ai_handlers_group_test.go b/internal/server/ai_handlers_group_test.go new file mode 100644 index 000000000..da8891066 --- /dev/null +++ b/internal/server/ai_handlers_group_test.go @@ -0,0 +1,149 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + dynamicfake "k8s.io/client-go/dynamic/fake" + + "github.com/skyhook-io/radar/internal/k8s" +) + +// Group-qualified AI GET must route to the dynamic cache so CRDs whose +// plural shadows a core kind (Knative serving.knative.dev/Service vs +// core/v1 Service) resolve to the requested object — not whichever the +// typed cache happens to hold under that kind/name pair. +// +// Without the group-first branch in fetchAIResource, FetchResource( +// "services", ...) returns the core/v1 Service from the typed informer +// and ?group=serving.knative.dev is silently dropped. The bug surfaces +// as wrong-object disclosure on the AI surface: a caller asking for the +// Knative Service receives the core Service's spec + IP + selector +// instead. This pins the fix and would regress if the typed cache is +// consulted before the group qualifier. 
+// +// Same bug class as T12's group-blind root lookup, but on the single- +// resource GET path; ResourceContext relationship walks already disambiguate +// by group (see pkg/topology/managedby_test.go), so a regression here is +// the last remaining hot spot for kind/plural collisions on the GET API. +func TestAIGetResource_GroupRoutesToDynamic(t *testing.T) { + // Seed a Knative Service named "nginx" in "default" — same name+ns as + // the core Service registered in TestMain. Without ?group routing, the + // typed cache wins and returns the core Service. With it, the dynamic + // cache returns the Knative Service. + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + // Warm the informer so the Get() call below sees the seeded object + // without racing on initial sync. 
+ dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev&context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + // context=none returns the minified resource directly (no envelope). + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "serving.knative.dev/v1" { + t.Fatalf("apiVersion = %q, want serving.knative.dev/v1 — group qualifier was ignored "+ + "and the typed cache's core Service was returned instead", apiVersion) + } + kind, _ := body["kind"].(string) + if kind != "Service" { + t.Errorf("kind = %q, want Service", kind) + } + // Cross-check: the core Service has a Spec.Selector / ClusterIP shape + // that the Knative seed does NOT have. A regression that returned the + // core Service would carry those fields here. + spec, _ := body["spec"].(map[string]any) + if _, hasSelector := spec["selector"]; hasSelector { + t.Errorf("response carries Service.spec.selector — looks like the core Service leaked through "+ + "despite ?group=serving.knative.dev; body=%+v", body) + } +} + +// Happy-path sibling for the test above: when no group is passed, the +// typed-cache-first path is correct (and must continue to be — the v1 +// core Service is the dominant case and must not pay a dynamic-cache +// detour just because the group-qualified branch was added). 
+func TestAIGetResource_NoGroupHitsTypedCache(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "v1" { + t.Fatalf("apiVersion = %q, want v1 (core Service) on no-group request", apiVersion) + } +} From 04d8d40163af640c255fd4a1a13350de71ff2ad5 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 03:35:19 +0300 Subject: [PATCH 08/33] feat(resourcecontext): attach summaryContext to list_resources + search hits (T8+T9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pkg/resourcecontext.BuildSummary — a tiny per-row enrichment helper that returns ManagedBy + Health + IssueCount (≤ ~60 bytes/row). Wire it through: - REST /api/ai/resources/{kind} list handler (handleAIListResources + aiListDynamic) - MCP list_resources tool (handleListResources + listDynamicResources) - REST /api/search executor (Hit.SummaryContext) - MCP search tool (handleSearch) Each handler builds the per-namespace issue index (via internal/issues.Compose) and topology snapshot once per request, then invokes BuildSummary per row. pkg/resourcecontext stays free of internal/* and pkg/topology imports — callers pre-compute ManagedBy via the new ManagedByFromOwner helper. All four surfaces honor a context=none opt-out (query param for REST, input arg for MCP) that returns bare rows. Tests: - Golden-file BuildSummary across Pod phases, Deployment replica states, NetworkPolicy (no health heuristic), CRD Ready/Available conditions, Health override, nil-object safety. 
- ManagedByFromOwner source classification (argocd / flux / native). - attach* helpers wired end-to-end with stub builders; defensive length- mismatch handling. - managedByFromRelationships prefers Deployment grandparent shortcut over noisy ReplicaSet owner. - Search executor invokes SummaryBuilder per kept hit when set, leaves SummaryContext nil when unset. --- internal/mcp/summary_context.go | 179 +++++++++++++ internal/mcp/tools.go | 74 +++++- internal/search/search.go | 31 ++- internal/search/summary_context_test.go | 85 ++++++ internal/search/types.go | 9 +- internal/server/ai_handlers.go | 63 ++++- internal/server/search_handler.go | 9 + internal/server/summary_context.go | 180 +++++++++++++ internal/server/summary_context_test.go | 215 ++++++++++++++++ pkg/ai/context/summary.go | 8 + pkg/resourcecontext/summary.go | 188 ++++++++++++++ pkg/resourcecontext/summary_test.go | 327 ++++++++++++++++++++++++ 12 files changed, 1351 insertions(+), 17 deletions(-) create mode 100644 internal/mcp/summary_context.go create mode 100644 internal/search/summary_context_test.go create mode 100644 internal/server/summary_context.go create mode 100644 internal/server/summary_context_test.go create mode 100644 pkg/resourcecontext/summary.go create mode 100644 pkg/resourcecontext/summary_test.go diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go new file mode 100644 index 000000000..43cfd7108 --- /dev/null +++ b/internal/mcp/summary_context.go @@ -0,0 +1,179 @@ +// Per-request helpers that compute the compact SummaryContext attached +// to list_resources rows and search hits served via MCP. Mirrors the +// equivalent helpers in internal/server (REST list + search). Kept +// separate so MCP doesn't pull in the server package. 
+ +package mcp + +import ( + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// summaryContextBuilder is the per-request closure that produces a +// SummaryContext for a single resource. nil result is fine — the +// SummaryContext field is omitempty on every consumer. +type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext + +// newSummaryContextBuilder assembles the per-request closure for MCP +// list_resources / search. Returns nil when the issue cache provider +// isn't available, in which case the caller should skip context +// attachment; a missing topology only leaves managedBy unset. +// +// namespaces scopes the issue index to just the rows being returned; +// pass nil for cluster-wide. kindFilter ("" for search, the requested +// kind for list_resources) narrows the issue compose to a single kind +// so list_resources kind=pod doesn't pull deployment + service issues. 
+func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + topo := buildSummaryContextTopology(namespaces) + idx := buildIssueIndex(provider, namespaces, kindFilter) + + resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) + dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + var managedBy *resourcecontext.ManagedByRef + if topo != nil { + rel := topology.GetRelationships(kind, namespace, name, topo, resourceProvider, dynamicProvider) + managedBy = managedByFromRelationships(rel) + } + var source runtime.Object = obj + if source == nil && u != nil { + source = u + } + return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ + ManagedBy: managedBy, + IssueCount: idx.count(kind, namespace, name), + }) + } +} + +// buildSummaryContextTopology builds a topology snapshot suitable for +// resolving managedBy pointers. MCP has no shared broadcaster cache, +// so we build directly via the builder. Returns nil on failure — the +// caller falls back to a managedBy-less SummaryContext rather than +// failing the response. +func buildSummaryContextTopology(namespaces []string) *topology.Topology { + cache := k8s.GetResourceCache() + if cache == nil { + return nil + } + builder := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). + WithDynamic(k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())) + opts := topology.DefaultBuildOptions() + if len(namespaces) > 0 { + opts.Namespaces = namespaces + } + topo, err := builder.Build(opts) + if err != nil { + return nil + } + return topo +} + +// issueIndex keys per-resource issue counts as "kind|namespace|name". 
+// Kind is canonicalized via canonicalSingular because issue sources emit +// the kind as-typed (Deployment) while callers may pass the URL plural +// (deployments) — canonicalization normalizes both. +type issueIndex map[string]int + +func (i issueIndex) count(kind, namespace, name string) int { + return i[issueIndexKey(kind, namespace, name)] +} + +func issueIndexKey(kind, namespace, name string) string { + return strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name +} + +func canonicalSingular(kind string) string { + k := strings.ToLower(kind) + switch k { + case "pods": + return "pod" + case "services": + return "service" + case "deployments": + return "deployment" + case "daemonsets": + return "daemonset" + case "statefulsets": + return "statefulset" + case "replicasets": + return "replicaset" + case "jobs": + return "job" + case "cronjobs": + return "cronjob" + case "ingresses": + return "ingress" + case "configmaps": + return "configmap" + case "secrets": + return "secret" + case "persistentvolumeclaims": + return "persistentvolumeclaim" + case "persistentvolumes": + return "persistentvolume" + case "storageclasses": + return "storageclass" + case "horizontalpodautoscalers", "hpas", "hpa": + return "horizontalpodautoscaler" + case "poddisruptionbudgets": + return "poddisruptionbudget" + case "nodes": + return "node" + case "namespaces": + return "namespace" + case "events": + return "event" + } + return k +} + +func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { + filters := issues.Filters{ + Namespaces: namespaces, + Limit: issues.MaxLimit, + } + if kindFilter != "" { + filters.Kinds = []string{canonicalSingular(kindFilter)} + } + composed := issues.Compose(p, filters) + idx := make(issueIndex, len(composed)) + for _, iss := range composed { + idx[issueIndexKey(iss.Kind, iss.Namespace, iss.Name)]++ + } + return idx +} + +// managedByFromRelationships extracts a compact ManagedByRef from +// 
computed topology relationships. Preference: Deployment grandparent +// shortcut (Pods owned by ReplicaSets surface the controlling +// Deployment, not the noisy hash-suffixed RS), then direct Owner. +func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { + if rel == nil { + return nil + } + var ref *topology.ResourceRef + switch { + case rel.Deployment != nil: + ref = rel.Deployment + case rel.Owner != nil: + ref = rel.Owner + default: + return nil + } + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) +} + diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index d76d9b612..64a5ae6cb 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -12,7 +12,9 @@ import ( "github.com/modelcontextprotocol/go-sdk/mcp" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/filter" "github.com/skyhook-io/radar/internal/helm" @@ -315,6 +317,7 @@ type listResourcesInput struct { Kind string `json:"kind" jsonschema:"resource kind to list, e.g. pods, deployments, services, configmaps"` Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. 
serving.knative.dev for Knative Service vs core Service)"` Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` + Context string `json:"context,omitempty" jsonschema:"per-row context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare rows"` } type getResourceInput struct { @@ -358,6 +361,7 @@ type searchInput struct { Limit int `json:"limit,omitempty" jsonschema:"max hits returned (default 50, max 500)"` Include string `json:"include,omitempty" jsonschema:"per-hit detail: summary (default), raw, or none"` Filter string `json:"filter,omitempty" jsonschema:"optional CEL boolean expression run against each candidate K8s object. Bindings: kind, apiVersion, metadata, spec, status, labels, annotations. Use has(x.y) before optional fields. Examples: 'kind == \"Pod\" && status.phase == \"Failed\"', 'labels[\"app\"] == \"cart\"', 'has(status.readyReplicas) && status.readyReplicas == 0'"` + Context string `json:"context,omitempty" jsonschema:"per-hit context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare hits"` } type issuesInput struct { @@ -470,7 +474,7 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li // Fall through to dynamic cache for CRDs. ClassifyKindScope/SAR // above already authorized cluster-scoped CRDs; namespaced CRDs // are scoped via listScope. - return listDynamicResources(ctx, cache, kind, group, listScope) + return listDynamicResources(ctx, cache, kind, group, listScope, input.Context) } if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) @@ -492,34 +496,83 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li return nil, nil, fmt.Errorf("failed to minify: %w", err) } + // Attach summaryContext per row unless caller opted out. 
Issue index + // is scoped to the listed kind so the per-row count reflects only + // the resource being listed (not unrelated noise in the namespace). + if input.Context != "none" { + if builder := newSummaryContextBuilder(allowed, kind); builder != nil { + attachSummaryContextToTyped(results, objs, builder) + } + } + return toJSONResult(results) } -func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string) (*mcp.CallToolResult, any, error) { - var allItems []any +// attachSummaryContextToTyped fills in SummaryContext on each +// Summary-verbosity ResourceSummary in-place. results and objs are +// produced in lockstep by MinifyList — a length mismatch is defensive +// (skip rather than panic). +func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder summaryContextBuilder) { + if len(results) != len(objs) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + summary.SummaryContext = builder(objs[i], nil, summary.Kind, summary.Namespace, summary.Name) + } +} + +func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, contextMode string) (*mcp.CallToolResult, any, error) { + var rawItems []*unstructured.Unstructured if len(namespaces) > 0 { for _, ns := range namespaces { items, err := cache.ListDynamicWithGroup(ctx, kind, ns, group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) - } + rawItems = append(rawItems, items...) 
} } else { items, err := cache.ListDynamicWithGroup(ctx, kind, "", group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + rawItems = items + } + + allItems := make([]any, 0, len(rawItems)) + for _, item := range rawItems { + allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + } + + if contextMode != "none" { + if builder := newSummaryContextBuilder(namespaces, kind); builder != nil { + attachSummaryContextToUnstructured(allItems, rawItems, builder) } } return toJSONResult(allItems) } +// attachSummaryContextToUnstructured fills in SummaryContext for the +// dynamic-CRD list path. summarizeUnstructured returns +// *aicontext.ResourceSummary, so the cast matches the typed path. +func attachSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { + if len(results) != len(items) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + summary.SummaryContext = builder(nil, items[i], summary.Kind, summary.Namespace, summary.Name) + } +} + func handleGetResource(ctx context.Context, req *mcp.CallToolRequest, input getResourceInput) (*mcp.CallToolResult, any, error) { cache := k8s.GetResourceCache() if cache == nil { @@ -2048,6 +2101,11 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp } opts.Filter = f } + if input.Context != "none" { + if builder := newSummaryContextBuilder(scanNamespaces, ""); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } result, err := search.Search(ctx, provider, parsed, opts) if err != nil { return nil, nil, err diff --git a/internal/search/search.go b/internal/search/search.go index 4d05f75c7..575970d6f 100644 --- a/internal/search/search.go +++ 
b/internal/search/search.go @@ -22,8 +22,17 @@ import ( "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" ) +// SummaryBuilderFunc, when supplied via Options.SummaryBuilder, is +// invoked once per matched hit to produce the compact per-row +// SummaryContext attached to the Hit. Exactly one of obj/u will be +// non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil +// is fine (the field is omitempty); callers use it to gate context +// emission per request (context=none opts out by passing nil here). +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext + // Provider abstracts the cache so tests can inject a fake. type Provider interface { ListTyped(kind string, namespaces []string) ([]runtime.Object, error) @@ -106,6 +115,13 @@ type Options struct { // drop the candidate. Compile happens in the handler; this layer // just runs the program. Filter *CELFilter + // SummaryBuilder, when non-nil, is invoked per matched hit to + // attach the compact SummaryContext (managedBy + health + + // issueCount). Handlers provide a closure that wraps the + // request-scoped topology + per-namespace issue index so the + // per-row cost stays flat. Pass nil to opt out (context=none) — + // the field is omitempty and consumers must tolerate its absence. + SummaryBuilder SummaryBuilderFunc } // Search runs the parsed query against the provider and returns ranked hits. 
@@ -204,7 +220,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, obj, nil)) + hits = append(hits, buildHit(score, matched, c, opts.Include, obj, nil, opts.SummaryBuilder)) } } @@ -270,7 +286,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, nil, u)) + hits = append(hits, buildHit(score, matched, c, opts.Include, nil, u, opts.SummaryBuilder)) } } @@ -345,8 +361,12 @@ func isClusterScopedKind(kind string) bool { // buildHit assembles the response shape for a matched candidate. Exactly // one of obj/u will be non-nil. minify-on-demand keeps the cost of -// IncludeNone (identity-only) flat. -func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { +// IncludeNone (identity-only) flat. summaryBuilder, when non-nil, is +// invoked to attach the compact per-row SummaryContext — kept separate +// from Include because context applies to every verbosity (including +// IncludeNone identity-only hits), while Summary/Raw control the full +// minified body. 
+func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured, summaryBuilder SummaryBuilderFunc) Hit { h := Hit{ Score: score, Kind: c.Kind, @@ -375,5 +395,8 @@ func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, h.Raw = aicontext.MinifyUnstructured(u, aicontext.LevelDetail) } } + if summaryBuilder != nil { + h.SummaryContext = summaryBuilder(obj, u, c.Kind, c.Namespace, c.Name) + } return h } diff --git a/internal/search/summary_context_test.go b/internal/search/summary_context_test.go new file mode 100644 index 000000000..4a9510e48 --- /dev/null +++ b/internal/search/summary_context_test.go @@ -0,0 +1,85 @@ +package search + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// TestSearch_SummaryBuilderAttached pins the wiring: when Options.SummaryBuilder +// is non-nil, the executor invokes it per kept hit and the result lands +// in Hit.SummaryContext. 
+func TestSearch_SummaryBuilderAttached(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, + }, + }, + }, + }, + } + + var calls int + builder := func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + calls++ + return &resourcecontext.SummaryContext{ + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: namespace}, + Health: "healthy", + IssueCount: 0, + } + } + + res, _ := Search(context.Background(), p, Parse("api-1"), Options{ + Include: IncludeNone, + SummaryBuilder: builder, + }) + if calls != 1 { + t.Fatalf("SummaryBuilder calls = %d, want 1", calls) + } + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + h := res.Hits[0] + if h.SummaryContext == nil { + t.Fatalf("SummaryContext not attached to hit: %+v", h) + } + if h.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", h.SummaryContext.Health) + } + if h.SummaryContext.ManagedBy == nil || h.SummaryContext.ManagedBy.Name != "api" { + t.Errorf("ManagedBy mismatch: %+v", h.SummaryContext.ManagedBy) + } +} + +// TestSearch_NoSummaryBuilder_LeavesNilContext is the opt-out path +// (context=none in the handler maps to nil SummaryBuilder here). Hits +// must have no SummaryContext. 
+func TestSearch_NoSummaryBuilder_LeavesNilContext(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + }, + }, + }, + } + res, _ := Search(context.Background(), p, Parse("api-1"), Options{Include: IncludeNone}) + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + if res.Hits[0].SummaryContext != nil { + t.Errorf("expected nil SummaryContext when SummaryBuilder unset, got %+v", res.Hits[0].SummaryContext) + } +} diff --git a/internal/search/types.go b/internal/search/types.go index 83551ba86..c37112b55 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -1,6 +1,9 @@ package search -import "github.com/skyhook-io/radar/internal/filter" +import ( + "github.com/skyhook-io/radar/internal/filter" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) const ( DefaultLimit = 50 @@ -46,6 +49,10 @@ type Hit struct { Summary any `json:"summary,omitempty"` Raw any `json:"raw,omitempty"` Matched []MatchedField `json:"matched,omitempty"` + // SummaryContext is the compact per-row enrichment (managedBy, health, + // issueCount). Populated by handlers via Options.SummaryBuilder; nil + // when the caller opted out (context=none) or no fields apply. + SummaryContext *resourcecontext.SummaryContext `json:"summaryContext,omitempty"` } // MatchedField records where a query token landed (debug + UI highlight). 
diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ea130bac..8349577c8 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -7,9 +7,10 @@ import ( "github.com/go-chi/chi/v5" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" "github.com/skyhook-io/radar/internal/k8s" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" ) // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. @@ -27,7 +28,11 @@ func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aico } // handleAIListResources returns a minified list of resources for AI consumption. -// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact +// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact&context=none +// +// summaryContext (managedBy + health + issueCount) is attached per row +// at Summary verbosity by default. Pass ?context=none to opt out for a +// bare list. 
func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return @@ -40,6 +45,7 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { } group := r.URL.Query().Get("group") level := parseVerbosity(r, aicontext.LevelSummary) + skipContext := r.URL.Query().Get("context") == "none" cache := k8s.GetResourceCache() if cache == nil { @@ -51,7 +57,7 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { objs, err := k8s.FetchResourceList(cache, kind, namespaces) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs - s.aiListDynamic(w, r, cache, kind, namespaces, group, level) + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) return } if err != nil { @@ -69,11 +75,54 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } + // Attach summaryContext per row at Summary verbosity. Compact/Detail + // already carry richer context on the get-resource path; the + // per-row attachment is specifically for cheap list triage. + if !skipContext && level == aicontext.LevelSummary { + if builder := s.newSummaryContextBuilder(namespaces, kind); builder != nil { + attachSummaryContextToList(results, objs, builder) + } + } + s.writeJSON(w, results) } +// attachSummaryContextToList walks the typed-cache list and assigns the +// per-row SummaryContext into each ResourceSummary in-place. results and +// objs are produced in lockstep by MinifyList; a length mismatch is +// defensive (and silently skips attachment rather than panicking) but +// shouldn't occur in practice. 
+func attachSummaryContextToList(results []any, objs []runtime.Object, builder summaryContextBuilder) { + if len(results) != len(objs) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + summary.SummaryContext = builder(objs[i], nil, summary.Kind, summary.Namespace, summary.Name) + } +} + +// attachSummaryContextToUnstructuredList does the same for the dynamic +// CRD path. MinifyUnstructured returns *ResourceSummary (Summary level) +// so the cast is the same shape. +func attachSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { + if len(results) != len(items) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + summary.SummaryContext = builder(nil, items[i], summary.Kind, summary.Namespace, summary.Name) + } +} + // aiListDynamic handles the CRD/dynamic fallback for AI list. 
-func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel) { +func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel, skipContext bool) { var allItems []*unstructured.Unstructured if len(namespaces) > 0 { @@ -107,6 +156,12 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 results = append(results, aicontext.MinifyUnstructured(item, level)) } + if !skipContext && level == aicontext.LevelSummary { + if builder := s.newSummaryContextBuilder(namespaces, kind); builder != nil { + attachSummaryContextToUnstructuredList(results, allItems, builder) + } + } + s.writeJSON(w, results) } diff --git a/internal/server/search_handler.go b/internal/server/search_handler.go index 486224ae4..cb8eb91ad 100644 --- a/internal/server/search_handler.go +++ b/internal/server/search_handler.go @@ -102,6 +102,15 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { return s.canRead(r, group, resource, "", "list") }, } + // summaryContext attaches managedBy/health/issueCount per hit. Build + // the per-request closure once (one Compose call + cached topology + // snapshot) and let the search executor invoke it per kept hit. + // ?context=none opts out so legacy callers don't pay for the join. 
+ if r.URL.Query().Get("context") != "none" { + if builder := s.newSummaryContextBuilder(scanNamespaces, ""); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } if expr := r.URL.Query().Get("filter"); expr != "" { f, err := filter.CachedObjectFilter(expr) if err != nil { diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go new file mode 100644 index 000000000..08e090ad6 --- /dev/null +++ b/internal/server/summary_context.go @@ -0,0 +1,180 @@ +// Per-request helpers that compute the compact SummaryContext attached +// to /api/ai/resources/{kind} list rows and /api/search hits. +// +// The helpers build a single per-namespace issue index and a cached +// topology snapshot up front, then expose a closure callers invoke +// per row. This keeps the per-row cost flat — without the index, +// listing 2000 pods would re-walk the entire issue compose pipeline +// per row. +// +// pkg/resourcecontext intentionally has no dependencies on internal/* +// or pkg/topology; the join happens here. + +package server + +import ( + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// summaryContextBuilder is the per-request closure that produces a +// SummaryContext for a single resource. nil result is fine — the +// SummaryContext field is omitempty on every consumer. +type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext + +// newSummaryContextBuilder assembles the per-request closure for the +// list/search handlers. Returns nil when the issue cache provider is +// unavailable, in which case callers should skip context attachment; +// a missing topology snapshot only leaves managedBy unset. 
+// +// Callers pass the namespace list they're scanning so the issue index +// is scoped to just those rows (the full Compose call on a 100-namespace +// cluster is fine; this is mostly belt-and-suspenders for very large +// envs). Pass nil to compose cluster-wide. +func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { + topo := s.broadcaster.GetCachedTopology() + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + + // One pass over the issue engine; group by kind/ns/name. Sources + // are restricted to "problem" + "condition" — the two always-on + // surfaces that match the default /api/issues + MCP issues_list + // behavior. Audit + Warning events are loud and require explicit + // opt-in; rolling them into the per-row count would distort + // "this Pod has 1 issue" for the common case. + idx := buildIssueIndex(provider, namespaces, kindFilter) + + resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) + dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + var managedBy *resourcecontext.ManagedByRef + if topo != nil { + rel := topology.GetRelationships(kind, namespace, name, topo, resourceProvider, dynamicProvider) + managedBy = managedByFromRelationships(rel) + } + var source runtime.Object = obj + if source == nil && u != nil { + source = u + } + return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ + ManagedBy: managedBy, + IssueCount: idx.count(kind, namespace, name), + }) + } +} + +// issueIndex keys per-resource issue counts as "kind|namespace|name". +// Kind is canonicalized via canonicalSingular because issue sources emit +// the kind as-typed (Deployment) while callers may pass the URL plural +// (deployments) — canonicalization normalizes both. 
+type issueIndex map[string]int + +func (i issueIndex) count(kind, namespace, name string) int { + return i[issueIndexKey(kind, namespace, name)] +} + +func issueIndexKey(kind, namespace, name string) string { + return strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name +} + +// canonicalSingular collapses common plural forms back to the singular +// kind the issue engine emits. Cheap surface — only the kinds we +// actually scan in list_resources / search. +func canonicalSingular(kind string) string { + k := strings.ToLower(kind) + switch k { + case "pods": + return "pod" + case "services": + return "service" + case "deployments": + return "deployment" + case "daemonsets": + return "daemonset" + case "statefulsets": + return "statefulset" + case "replicasets": + return "replicaset" + case "jobs": + return "job" + case "cronjobs": + return "cronjob" + case "ingresses": + return "ingress" + case "configmaps": + return "configmap" + case "secrets": + return "secret" + case "persistentvolumeclaims": + return "persistentvolumeclaim" + case "persistentvolumes": + return "persistentvolume" + case "storageclasses": + return "storageclass" + case "horizontalpodautoscalers", "hpas", "hpa": + return "horizontalpodautoscaler" + case "poddisruptionbudgets": + return "poddisruptionbudget" + case "nodes": + return "node" + case "namespaces": + return "namespace" + case "events": + return "event" + } + return k +} + +func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { + filters := issues.Filters{ + Namespaces: namespaces, + Limit: issues.MaxLimit, + } + if kindFilter != "" { + // Compose's Kinds filter expects the singular kind ("Pod"). The + // caller may pass either the URL plural ("pods") or the singular — + // canonicalSingular normalizes both before issuing the filter. 
+ filters.Kinds = []string{canonicalSingular(kindFilter)} + } + composed := issues.Compose(p, filters) + idx := make(issueIndex, len(composed)) + for _, iss := range composed { + idx[issueIndexKey(iss.Kind, iss.Namespace, iss.Name)]++ + } + return idx +} + +// managedByFromRelationships extracts a compact ManagedByRef from +// computed topology relationships. Preference order: +// 1. Deployment grandparent shortcut (Pods owned by ReplicaSets surface +// the controlling Deployment, not the noisy hash-suffixed RS). +// 2. Direct Owner — covers everything else (StatefulSet pod → STS, +// Job pod → Job, ArgoCD Application children, Flux Kustomization +// children, etc.). +// +// Returns nil when topology has no relationship for the resource. +func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { + if rel == nil { + return nil + } + var ref *topology.ResourceRef + switch { + case rel.Deployment != nil: + ref = rel.Deployment + case rel.Owner != nil: + ref = rel.Owner + default: + return nil + } + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) +} diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go new file mode 100644 index 000000000..417166c90 --- /dev/null +++ b/internal/server/summary_context_test.go @@ -0,0 +1,215 @@ +package server + +import ( + "encoding/json" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// stubBuilder records calls and returns a deterministic SummaryContext +// keyed by the resource identity. Avoids standing up a topology cache or +// issue provider — those are exercised by the per-layer unit tests. 
+func stubBuilder(t *testing.T, want map[string]*resourcecontext.SummaryContext) summaryContextBuilder { + t.Helper() + return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + key := kind + "|" + namespace + "|" + name + return want[key] + } +} + +// TestAttachSummaryContextToList wires together MinifyList + the +// per-row attach helper and asserts the SummaryContext field lands in +// the JSON each row marshals to. +func TestAttachSummaryContextToList(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-1", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning, ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}}, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-2", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + } + want := map[string]*resourcecontext.SummaryContext{ + "Pod|prod|api-1": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "healthy", + IssueCount: 0, + }, + "Pod|prod|api-2": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "unhealthy", + IssueCount: 3, + }, + } + + results, err := aicontext.MinifyList(objs, aicontext.LevelSummary) + if err != nil { + t.Fatalf("MinifyList: %v", err) + } + attachSummaryContextToList(results, objs, stubBuilder(t, want)) + + // Row 0 — healthy pod. + b, _ := json.Marshal(results[0]) + wantSubs := []string{ + `"summaryContext":`, + `"managedBy":{"kind":"Deployment"`, + `"health":"healthy"`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 0 missing %s in %s", sub, b) + } + } + + // Row 1 — unhealthy pod with issueCount. 
+ b, _ = json.Marshal(results[1]) + wantSubs = []string{ + `"health":"unhealthy"`, + `"issueCount":3`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 1 missing %s in %s", sub, b) + } + } +} + +// TestAttachSummaryContextToList_MismatchedLengthsSilent — defensive +// path that protects against a future refactor where MinifyList might +// drop unsupported kinds. Attach must skip rather than panic. +func TestAttachSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "api-1"}}, + } + results := []any{ + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-1"}, + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, + } + // Length mismatch (1 obj vs 2 results) — must not panic, must skip. + attachSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + return &resourcecontext.SummaryContext{Health: "healthy"} + }) + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok { + t.Fatalf("row %d: unexpected type %T", i, row) + } + if summary.SummaryContext != nil { + t.Errorf("row %d: SummaryContext should be nil on length mismatch, got %#v", i, summary.SummaryContext) + } + } +} + +// TestAttachSummaryContextToUnstructuredList covers the dynamic-CRD +// path. summarizeUnstructured returns *ResourceSummary so the attach +// helper is symmetric with the typed path. 
+func TestAttachSummaryContextToUnstructuredList(t *testing.T) { + items := []*unstructured.Unstructured{ + {Object: map[string]any{ + "apiVersion": "argoproj.io/v1alpha1", + "kind": "Application", + "metadata": map[string]any{"name": "storefront", "namespace": "argocd"}, + "status": map[string]any{"conditions": []any{map[string]any{"type": "Ready", "status": "True"}}}, + }}, + } + want := map[string]*resourcecontext.SummaryContext{ + "Application|argocd|storefront": { + Health: "healthy", + IssueCount: 1, + }, + } + + results := []any{aicontext.MinifyUnstructured(items[0], aicontext.LevelSummary)} + attachSummaryContextToUnstructuredList(results, items, stubBuilder(t, want)) + + summary, ok := results[0].(*aicontext.ResourceSummary) + if !ok || summary == nil { + t.Fatalf("unexpected row type %T", results[0]) + } + if summary.SummaryContext == nil { + t.Fatalf("SummaryContext not attached") + } + if summary.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", summary.SummaryContext.Health) + } + if summary.SummaryContext.IssueCount != 1 { + t.Errorf("IssueCount = %d, want 1", summary.SummaryContext.IssueCount) + } +} + +// TestManagedByFromRelationships_PrefersDeployment pins the Pod → +// Deployment grandparent shortcut over the noisier ReplicaSet owner. 
+func TestManagedByFromRelationships_PrefersDeployment(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + Deployment: &topology.ResourceRef{Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + } + got := managedByFromRelationships(rel) + want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} + if got == nil || got.Kind != want.Kind || got.Name != want.Name || got.Namespace != want.Namespace || got.Source != want.Source { + t.Errorf("got %#v, want %#v", got, want) + } +} + +// TestManagedByFromRelationships_FallsBackToOwner covers the +// non-Pod case (StatefulSet → STS, Job pod → Job, etc.). +func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + } + got := managedByFromRelationships(rel) + if got == nil { + t.Fatalf("got nil, want Application ref") + } + if got.Source != "argocd" { + t.Errorf("Source = %q, want argocd", got.Source) + } +} + +func TestManagedByFromRelationships_NilSafe(t *testing.T) { + if got := managedByFromRelationships(nil); got != nil { + t.Errorf("nil rel: got %#v, want nil", got) + } + if got := managedByFromRelationships(&topology.Relationships{}); got != nil { + t.Errorf("empty rel: got %#v, want nil", got) + } +} + +// TestCanonicalSingular pins the kind normalization used to align URL +// plurals with the singular form the issue engine emits. 
+func TestCanonicalSingular(t *testing.T) { + cases := map[string]string{ + "pods": "pod", + "Pods": "pod", + "Deployment": "deployment", + "deployments": "deployment", + "hpa": "horizontalpodautoscaler", + "unknownkind": "unknownkind", + } + for in, want := range cases { + if got := canonicalSingular(in); got != want { + t.Errorf("canonicalSingular(%q) = %q, want %q", in, got, want) + } + } +} + +// contains is a tiny strings.Contains alias kept local so the test file +// doesn't need a strings import alongside the existing imports. +func contains(s, sub string) bool { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false +} diff --git a/pkg/ai/context/summary.go b/pkg/ai/context/summary.go index 47ece9d7f..62f53b651 100644 --- a/pkg/ai/context/summary.go +++ b/pkg/ai/context/summary.go @@ -14,6 +14,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/pkg/resourcecontext" ) // ResourceSummary is the typed output for Summary-level minification. @@ -68,6 +70,12 @@ type ResourceSummary struct { Capacity string `json:"capacity,omitempty"` AccessModes []string `json:"accessModes,omitempty"` Owner string `json:"owner,omitempty"` + + // SummaryContext is the per-row enrichment attached by AI-facing list + // surfaces (REST /api/ai/resources/{kind}, MCP list_resources, search + // hits). Populated by handlers post-minify via resourcecontext.BuildSummary; + // nil when the caller opted out (?context=none) or when no fields apply. 
+ SummaryContext *resourcecontext.SummaryContext `json:"summaryContext,omitempty"` } // summarize dispatches to the appropriate per-type extractor and then diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go new file mode 100644 index 000000000..9a1684885 --- /dev/null +++ b/pkg/resourcecontext/summary.go @@ -0,0 +1,188 @@ +package resourcecontext + +import ( + "strings" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// SummaryOptions configures the per-row enrichment produced by +// BuildSummary. All fields are pre-computed by the caller — this +// package never touches the issue engine, topology builder, or audit +// cache directly. Handlers in internal/* (REST list, MCP list_resources, +// search) walk the per-request topology + issue indexes once and pass +// the per-row digest in here. +type SummaryOptions struct { + // ManagedBy is the compact owner/GitOps pointer attached to the row. + // Callers derive this from topology.Relationships via + // ManagedByFromOwner; nil leaves the field absent. + ManagedBy *ManagedByRef + + // IssueCount is the count of internal issue-engine findings scoped to + // the subject resource. Callers pre-compute a per-namespace index + // (e.g. via internal/issues.ComposeWithStats) once per request and + // pass the count in for each row. Zero omits the field. + IssueCount int + + // Health, when non-empty, overrides the derived health string. The + // default is computed from resource status via deriveHealth — Pod + // container readiness, replica-count workloads, and the standard + // Ready/Available condition on CRDs. Non-trivial kinds derive to "". + Health string +} + +// BuildSummary produces the compact per-row SummaryContext attached to +// list_resources, /api/ai/resources/{kind} list, and search hits. +// +// Tightly bounded — targets ≤ 60 bytes per row when present. 
Returns +// nil when all three fields would be empty so callers can `omitempty` +// the entire object on bare rows and keep the wire shape minimal. +func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { + health := opts.Health + if health == "" { + health = deriveHealth(obj) + } + if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { + return nil + } + return &SummaryContext{ + ManagedBy: opts.ManagedBy, + Health: health, + IssueCount: opts.IssueCount, + } +} + +// ManagedByFromOwner assembles a compact ManagedByRef from raw owner +// fields (typically pulled out of topology.Relationships in the handler). +// Returns nil when ownerKind or ownerName is empty so callers don't +// have to guard the assignment. +// +// Source classification: +// - "argocd" for argoproj.io kinds (Application, ApplicationSet, Rollout) +// - "flux" for *.fluxcd.io kinds (Kustomization, HelmRelease, GitRepository, …) +// - "native" for everything else (Deployment, StatefulSet, DaemonSet, ReplicaSet, Job, …) +func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) *ManagedByRef { + if ownerKind == "" || ownerName == "" { + return nil + } + return &ManagedByRef{ + Kind: ownerKind, + Source: sourceForOwner(ownerKind, ownerGroup), + Name: ownerName, + Namespace: ownerNamespace, + } +} + +func sourceForOwner(_ string, group string) string { + switch group { + case "argoproj.io": + return "argocd" + } + if strings.HasSuffix(group, ".fluxcd.io") { + return "flux" + } + return "native" +} + +// deriveHealth applies a tiny per-kind heuristic to classify a resource +// as "healthy" | "degraded" | "unhealthy". Kinds we don't recognize +// derive to "" and the field is omitted on the wire. +// +// Vocabulary matches the broader status-tone scheme used across the UI +// (k8s-ui StatusTone) so consumers don't need to translate. 
+func deriveHealth(obj runtime.Object) string { + if obj == nil { + return "" + } + switch o := obj.(type) { + case *corev1.Pod: + return podHealth(o) + case *appsv1.Deployment: + return replicasHealth(o.Status.ReadyReplicas, o.Status.Replicas) + case *appsv1.StatefulSet: + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *appsv1.DaemonSet: + return replicasHealth(o.Status.NumberReady, o.Status.DesiredNumberScheduled) + case *appsv1.ReplicaSet: + return replicasHealth(o.Status.ReadyReplicas, o.Status.Replicas) + case *unstructured.Unstructured: + return unstructuredHealth(o) + } + return "" +} + +func podHealth(p *corev1.Pod) string { + switch p.Status.Phase { + case corev1.PodRunning: + if len(p.Status.ContainerStatuses) == 0 { + return "degraded" + } + for _, cs := range p.Status.ContainerStatuses { + if !cs.Ready { + return "degraded" + } + } + return "healthy" + case corev1.PodSucceeded: + return "healthy" + case corev1.PodFailed: + return "unhealthy" + case corev1.PodPending: + return "degraded" + } + return "" +} + +func replicasHealth(ready, desired int32) string { + if desired <= 0 { + return "" + } + if ready >= desired { + return "healthy" + } + if ready <= 0 { + return "unhealthy" + } + return "degraded" +} + +// unstructuredHealth derives health for CRDs that follow the standard +// Ready/Available condition pattern. Returns "" for kinds without a +// matching condition so we don't emit a misleading status for resources +// whose status shape we don't understand. 
+func unstructuredHealth(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + conditions, found, _ := unstructured.NestedSlice(u.Object, "status", "conditions") + if !found || len(conditions) == 0 { + return "" + } + for _, c := range conditions { + cond, ok := c.(map[string]any) + if !ok { + continue + } + condType, _ := cond["type"].(string) + if condType != "Ready" && condType != "Available" { + continue + } + status, _ := cond["status"].(string) + switch status { + case "True": + return "healthy" + case "False": + return "unhealthy" + default: + return "degraded" + } + } + return "" +} diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go new file mode 100644 index 000000000..ce280bb67 --- /dev/null +++ b/pkg/resourcecontext/summary_test.go @@ -0,0 +1,327 @@ +package resourcecontext + +import ( + "encoding/json" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// TestBuildSummary_NilWhenEmpty pins that BuildSummary returns nil when +// every field would be empty — keeps the per-row JSON minimal. +func TestBuildSummary_NilWhenEmpty(t *testing.T) { + // ConfigMap has no health heuristic and no caller-supplied options. + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "x", Namespace: "y"}} + if got := BuildSummary(cm, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(ConfigMap, {}) = %#v, want nil", got) + } +} + +// TestBuildSummary_PodGoldens golden-files BuildSummary across the +// Pod phases that drive the health heuristic. Locks the wire shape +// for the common "list pods" call. 
+func TestBuildSummary_PodGoldens(t *testing.T) { + cases := []struct { + name string + pod *corev1.Pod + opts SummaryOptions + want string + }{ + { + name: "running_all_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + {Name: "sidecar", Ready: true}, + }, + }, + }, + want: `{"health":"healthy"}`, + }, + { + name: "running_one_not_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: false}, + }, + }, + }, + want: `{"health":"degraded"}`, + }, + { + name: "failed", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "pending", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodPending}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "running_with_issues_and_managedby", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + }, + }, + }, + opts: SummaryOptions{ + ManagedBy: ManagedByFromOwner("ReplicaSet", "apps", "prod", "api-7d5"), + IssueCount: 2, + }, + want: `{"managedBy":{"kind":"ReplicaSet","source":"native","name":"api-7d5","namespace":"prod"},"health":"healthy","issueCount":2}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := BuildSummary(c.pod, c.opts) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, err := json.Marshal(got) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_DeploymentReplicasHealth covers the replica-driven +// health heuristic across the Deployment cases. 
+func TestBuildSummary_DeploymentReplicasHealth(t *testing.T) { + cases := []struct { + name string + ready int32 + desired int32 + wantSlice []byte // JSON of BuildSummary output + }{ + {"all_ready", 3, 3, []byte(`{"health":"healthy"}`)}, + {"none_ready", 0, 3, []byte(`{"health":"unhealthy"}`)}, + {"partial", 1, 3, []byte(`{"health":"degraded"}`)}, + {"scaled_to_zero", 0, 0, nil}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + dep := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ + ReadyReplicas: c.ready, + Replicas: c.desired, + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if c.wantSlice == nil { + if got != nil { + t.Fatalf("got %#v, want nil", got) + } + return + } + if got == nil { + t.Fatalf("got nil, want %s", c.wantSlice) + } + b, _ := json.Marshal(got) + if string(b) != string(c.wantSlice) { + t.Errorf("got %s\nwant %s", b, c.wantSlice) + } + }) + } +} + +// TestBuildSummary_NetworkPolicy verifies BuildSummary handles a kind +// without a health heuristic — it should only emit fields the caller +// supplied (e.g. issueCount, managedBy) and skip health entirely. +func TestBuildSummary_NetworkPolicy(t *testing.T) { + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "deny-all", Namespace: "prod"}, + } + // Empty opts → nil; the kind has no health heuristic so no field is set. + if got := BuildSummary(np, SummaryOptions{}); got != nil { + t.Fatalf("got %#v, want nil", got) + } + // IssueCount only → summary with just issueCount. + got := BuildSummary(np, SummaryOptions{IssueCount: 3}) + if got == nil { + t.Fatalf("got nil, want summary with issueCount") + } + b, _ := json.Marshal(got) + want := `{"issueCount":3}` + if string(b) != want { + t.Errorf("got %s\nwant %s", b, want) + } +} + +// TestBuildSummary_UnstructuredReadyCondition covers the CRD fallback +// — Ready/Available conditions are translated to the health vocabulary. 
+func TestBuildSummary_UnstructuredReadyCondition(t *testing.T) { + cases := []struct { + name string + conditions []any + want string + }{ + { + name: "ready_true", + conditions: []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + { + name: "ready_false", + conditions: []any{ + map[string]any{"type": "Ready", "status": "False"}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "ready_unknown", + conditions: []any{ + map[string]any{"type": "Ready", "status": "Unknown"}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "available_true", + conditions: []any{ + map[string]any{"type": "Available", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + u := &unstructured.Unstructured{Object: map[string]any{ + "apiVersion": "example.io/v1", + "kind": "Widget", + "status": map[string]any{"conditions": c.conditions}, + }} + got := BuildSummary(u, SummaryOptions{}) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, _ := json.Marshal(got) + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_HealthOverride pins that caller-supplied Health +// short-circuits the per-kind heuristic. +func TestBuildSummary_HealthOverride(t *testing.T) { + dep := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ReadyReplicas: 3, Replicas: 3}, + } + got := BuildSummary(dep, SummaryOptions{Health: "degraded"}) + if got == nil || got.Health != "degraded" { + t.Fatalf("Health override ignored: %#v", got) + } +} + +// TestManagedByFromOwner pins source classification for each cluster +// of owner kinds we care about. 
+func TestManagedByFromOwner(t *testing.T) { + cases := []struct { + name string + kind string + group string + namespace string + ownerName string + want *ManagedByRef + }{ + { + name: "empty_kind", + kind: "", + ownerName: "x", + want: nil, + }, + { + name: "empty_name", + kind: "Deployment", + ownerName: "", + want: nil, + }, + { + name: "deployment", + kind: "Deployment", + group: "apps", + namespace: "prod", + ownerName: "api", + want: &ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + }, + { + name: "argocd_application", + kind: "Application", + group: "argoproj.io", + namespace: "argocd", + ownerName: "storefront", + want: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, + }, + { + name: "flux_kustomization", + kind: "Kustomization", + group: "kustomize.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_helmrelease", + kind: "HelmRelease", + group: "helm.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_gitrepository", + kind: "GitRepository", + group: "source.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "repo", + want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := ManagedByFromOwner(c.kind, c.group, c.namespace, c.ownerName) + if !reflect.DeepEqual(got, c.want) { + t.Errorf("ManagedByFromOwner(%q, %q, %q, %q) = %#v, want %#v", + c.kind, c.group, c.namespace, c.ownerName, got, c.want) + } + }) + } +} + +// TestBuildSummary_NilObject defends against the typed-nil-in-interface +// trap: handlers occasionally pass interface-wrapped nils. 
+func TestBuildSummary_NilObject(t *testing.T) { + var obj runtime.Object + if got := BuildSummary(obj, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(nil) = %#v, want nil", got) + } + // IssueCount alone still produces output (no panic via nil obj). + got := BuildSummary(obj, SummaryOptions{IssueCount: 1}) + if got == nil || got.IssueCount != 1 { + t.Fatalf("BuildSummary(nil, IssueCount=1) = %#v, want issueCount=1", got) + } +} From 8339c67a3414cf5bc1f69b2adc6480a09127c5e2 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:07:25 +0300 Subject: [PATCH 09/33] feat(resourcecontext): consolidate summary builder on T23 (ManagedBy + shared index) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that T23 (#720) merged the server-synthesized Relationships.ManagedBy chain and the RelationshipsIndex inverted-edges lookup, the per-row SummaryContext builder no longer needs to re-detect ownership and no longer pays O(E) per call. - Read rel.ManagedBy[0] for the ManagedByRef (ArgoCD > Flux > Helm > topmost K8s owner). Falls back to rel.Owner when synthesis declines. Drops the local Deployment-grandparent shortcut — ManagedBy walks past the noisy hash-suffixed ReplicaSet for us. - Build topology.IndexByResource(topo) once per request and thread it into GetRelationshipsWithObject on every row. Without the index, the list_resources + search hot paths were O(N x E). - Use GetRelationshipsWithObject (not -WithIndex) so synthesis is group-aware when the row has the typed/unstructured object handy — avoids kind/plural collisions like Knative Service vs corev1 Service. - Update server tests to pin the ManagedBy preference and the new Owner fallback shape. 
--- internal/mcp/summary_context.go | 43 +++++++++++++++------- internal/server/summary_context.go | 49 +++++++++++++++++-------- internal/server/summary_context_test.go | 36 ++++++++++++++---- 3 files changed, 93 insertions(+), 35 deletions(-) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 43cfd7108..cd10286f7 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -42,10 +42,29 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + // One inverted-edges index per request — without it each + // GetRelationships call would re-scan topo.Edges in O(E), turning + // the list/search hot path into O(N × E). See pkg/topology T3. + var relIdx *topology.RelationshipsIndex + if topo != nil { + relIdx = topology.IndexByResource(topo) + } + return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { var managedBy *resourcecontext.ManagedByRef if topo != nil { - rel := topology.GetRelationships(kind, namespace, name, topo, resourceProvider, dynamicProvider) + // Pass the fetched object when available so synthesis is + // group-aware (avoids kind/plural collisions like Knative + // Service vs corev1 Service). Falls back to (kind, ns, name) + // lookup when neither obj nor u is set. 
+ var rawObj any + switch { + case obj != nil: + rawObj = obj + case u != nil: + rawObj = u + } + rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) managedBy = managedByFromRelationships(rel) } var source runtime.Object = obj @@ -158,22 +177,20 @@ func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) } // managedByFromRelationships extracts a compact ManagedByRef from -// computed topology relationships. Preference: Deployment grandparent -// shortcut (Pods owned by ReplicaSets surface the controlling -// Deployment, not the noisy hash-suffixed RS), then direct Owner. +// computed topology relationships. Preference: server-synthesized +// Relationships.ManagedBy (ArgoCD > Flux > Helm > topmost K8s owner), +// then direct Owner as fallback when synthesis declines. func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { if rel == nil { return nil } - var ref *topology.ResourceRef - switch { - case rel.Deployment != nil: - ref = rel.Deployment - case rel.Owner != nil: - ref = rel.Owner - default: - return nil + if len(rel.ManagedBy) > 0 { + ref := rel.ManagedBy[0] + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + } + if rel.Owner != nil { + return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) } - return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + return nil } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index 08e090ad6..d9b3140da 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -56,10 +56,29 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) dynamicProvider := 
k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + // One inverted-edges index per request — without it each + // GetRelationships call would re-scan topo.Edges in O(E), turning + // the list/search hot path into O(N × E). See pkg/topology T3. + var relIdx *topology.RelationshipsIndex + if topo != nil { + relIdx = topology.IndexByResource(topo) + } + return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { var managedBy *resourcecontext.ManagedByRef if topo != nil { - rel := topology.GetRelationships(kind, namespace, name, topo, resourceProvider, dynamicProvider) + // Pass the fetched object when available so synthesis is + // group-aware (avoids kind/plural collisions like Knative + // Service vs corev1 Service). Falls back to (kind, ns, name) + // lookup when neither obj nor u is set. + var rawObj any + switch { + case obj != nil: + rawObj = obj + case u != nil: + rawObj = u + } + rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) managedBy = managedByFromRelationships(rel) } var source runtime.Object = obj @@ -156,25 +175,25 @@ func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) // managedByFromRelationships extracts a compact ManagedByRef from // computed topology relationships. Preference order: -// 1. Deployment grandparent shortcut (Pods owned by ReplicaSets surface -// the controlling Deployment, not the noisy hash-suffixed RS). -// 2. Direct Owner — covers everything else (StatefulSet pod → STS, -// Job pod → Job, ArgoCD Application children, Flux Kustomization -// children, etc.). +// 1. Relationships.ManagedBy[0] — the server-synthesized topmost +// manager (ArgoCD Application > Flux Kustomization/HelmRelease > +// Helm release > topmost K8s owner). Walks the owner chain past +// ReplicaSets to the controlling Deployment in one shot. +// 2. 
Direct Owner — fallback for shapes ManagedBy synthesis declines +// (e.g. cluster-scoped roots where the topmost manager is the +// resource itself). // // Returns nil when topology has no relationship for the resource. func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { if rel == nil { return nil } - var ref *topology.ResourceRef - switch { - case rel.Deployment != nil: - ref = rel.Deployment - case rel.Owner != nil: - ref = rel.Owner - default: - return nil + if len(rel.ManagedBy) > 0 { + ref := rel.ManagedBy[0] + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + } + if rel.Owner != nil { + return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) } - return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + return nil } diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 417166c90..27e7bdd37 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -147,12 +147,16 @@ func TestAttachSummaryContextToUnstructuredList(t *testing.T) { } } -// TestManagedByFromRelationships_PrefersDeployment pins the Pod → -// Deployment grandparent shortcut over the noisier ReplicaSet owner. -func TestManagedByFromRelationships_PrefersDeployment(t *testing.T) { +// TestManagedByFromRelationships_PrefersManagedBy pins the topmost-manager +// shortcut: when topology has synthesized a ManagedBy chain (Pod → +// ReplicaSet → Deployment), the helper surfaces the Deployment, not the +// noisy hash-suffixed ReplicaSet that sits in Owner. 
+func TestManagedByFromRelationships_PrefersManagedBy(t *testing.T) { rel := &topology.Relationships{ - Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, - Deployment: &topology.ResourceRef{Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + }, } got := managedByFromRelationships(rel) want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} @@ -161,8 +165,9 @@ func TestManagedByFromRelationships_PrefersDeployment(t *testing.T) { } } -// TestManagedByFromRelationships_FallsBackToOwner covers the -// non-Pod case (StatefulSet → STS, Job pod → Job, etc.). +// TestManagedByFromRelationships_FallsBackToOwner covers the case where +// topology synthesis declined ManagedBy (e.g. cluster-scoped roots) — +// we still surface the direct Owner so the row isn't context-less. func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { rel := &topology.Relationships{ Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, @@ -176,6 +181,23 @@ func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { } } +// TestManagedByFromRelationships_ManagedByWinsOverOwner pins that when +// both ManagedBy and Owner are set, ManagedBy[0] takes precedence — the +// server-synthesized topmost-manager walk should never be shadowed by +// the direct owner ref left over for back-compat. 
+func TestManagedByFromRelationships_ManagedByWinsOverOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + }, + } + got := managedByFromRelationships(rel) + if got == nil || got.Kind != "Application" || got.Source != "argocd" { + t.Errorf("got %#v, want Application/argocd", got) + } +} + func TestManagedByFromRelationships_NilSafe(t *testing.T) { if got := managedByFromRelationships(nil); got != nil { t.Errorf("nil rel: got %#v, want nil", got) From 8309c105a6185f599073d8ae223f062853e5f098 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:42:53 +0300 Subject: [PATCH 10/33] fix(summaryContext): group-aware issue index, uncapped count, native Helm classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three reviewer findings on PR #722: 1. issueIndexKey was group-blind: two CRDs sharing kind+ns+name in different API groups (Knative Service vs corev1 Service, two operators each shipping a "Cluster" CRD) collided on the same bucket and inherited each other's count. Threaded group through the builder + key — format is now "group|kind|ns|name", "|" is illegal in K8s API groups so it's a safe delimiter. 2. buildIssueIndex used Limit:MaxLimit (1000), so on >1000-issue clusters resources whose issues fell in the post-sort tail silently got issueCount=0. Picked Option B: added an explicit NoLimit sentinel in internal/issues. Fewer files touched than Option A (a separate CountByResource API), and the index is the only caller that wants the uncapped set — public /api/issues + issues_list keep their tight caps. ComposeStats.TotalMatched already reported the true total regardless of truncation. 3. 
sourceForOwner returned "native" for {Kind:"HelmRelease", Group:""} — but topology.detectManagedByFromMeta emits exactly that shape for native Helm installs (distinguished from Flux's HelmRelease CR which lives at helm.toolkit.fluxcd.io). Added a "helm" branch ahead of the group-based GitOps switch. Tests: - pkg/resourcecontext: native Helm classification. - internal/server + internal/mcp: group-aware index (two CRDs same kind+ns+name), tail-of-MaxLimit overflow. Signature change: summaryContextBuilder and search.SummaryBuilderFunc now take `group` as a parameter. All callers updated to source group from typed object GVK (SetTypeMeta path) or unstructured apiVersion. --- internal/issues/issues.go | 11 +- internal/issues/types.go | 7 ++ internal/mcp/summary_context.go | 41 +++++--- internal/mcp/summary_context_test.go | 102 ++++++++++++++++++ internal/mcp/tools.go | 34 +++++- internal/search/search.go | 10 +- internal/search/summary_context_test.go | 9 +- internal/server/ai_handlers.go | 38 ++++++- internal/server/summary_context.go | 42 +++++--- internal/server/summary_context_test.go | 131 ++++++++++++++++++++++-- pkg/resourcecontext/summary.go | 13 ++- pkg/resourcecontext/summary_test.go | 15 +++ 12 files changed, 411 insertions(+), 42 deletions(-) create mode 100644 internal/mcp/summary_context_test.go diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 2ed01b2bb..3ca644a24 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -74,10 +74,17 @@ func Compose(p Provider, f Filters) []Issue { // severity desc, then last-seen desc, then kind/ns/name for stable // tiebreaks. 
func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { + // Negative Limit is the "uncapped" sentinel: callers that need the + // full matched set (per-resource issue indexes for /api/ai list + + // search summaryContext) pass NoLimit so a 5000-issue cluster + // doesn't silently drop counts for resources whose issues fall in + // the tail beyond MaxLimit. Zero still maps to DefaultLimit so the + // public /api/issues + MCP issues_list keep their tight caps. + uncapped := f.Limit < 0 if f.Limit == 0 { f.Limit = DefaultLimit } - if f.Limit > MaxLimit { + if !uncapped && f.Limit > MaxLimit { f.Limit = MaxLimit } @@ -201,7 +208,7 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { return out[i].Name < out[j].Name }) stats.TotalMatched = len(out) - if len(out) > f.Limit { + if !uncapped && len(out) > f.Limit { out = out[:f.Limit] } return out, stats diff --git a/internal/issues/types.go b/internal/issues/types.go index 11b368433..63b22a38c 100644 --- a/internal/issues/types.go +++ b/internal/issues/types.go @@ -123,4 +123,11 @@ type Filters struct { const ( DefaultLimit = 200 MaxLimit = 1000 + // NoLimit disables the result cap. Pass as Filters.Limit when the + // caller needs the full matched set (e.g. building a per-resource + // issue index for summaryContext — capping there would silently zero + // out counts for resources whose issues fall in the tail beyond + // MaxLimit on large clusters). Stats.TotalMatched is reliable + // regardless; this just turns off the post-sort slice. + NoLimit = -1 ) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index cd10286f7..364f1280e 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -20,7 +20,12 @@ import ( // summaryContextBuilder is the per-request closure that produces a // SummaryContext for a single resource. nil result is fine — the // SummaryContext field is omitempty on every consumer. 
-type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext +// +// group is required so the per-resource issue lookup can distinguish +// CRDs that share kind+namespace+name across API groups (e.g. Knative +// Service vs corev1 Service, or two custom CRDs both named "Cluster" +// from different operators). Pass "" for core-group resources. +type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext // newSummaryContextBuilder assembles the per-request closure for MCP // list_resources / search. Returns nil when the cache or topology @@ -50,7 +55,7 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon relIdx = topology.IndexByResource(topo) } - return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { var managedBy *resourcecontext.ManagedByRef if topo != nil { // Pass the fetched object when available so synthesis is @@ -73,7 +78,7 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon } return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ ManagedBy: managedBy, - IssueCount: idx.count(kind, namespace, name), + IssueCount: idx.count(group, kind, namespace, name), }) } } @@ -101,18 +106,23 @@ func buildSummaryContextTopology(namespaces []string) *topology.Topology { return topo } -// issueIndex keys per-resource issue counts as "kind|namespace|name". -// Kind is canonicalized via canonicalSingular because issue sources emit -// the kind as-typed (Deployment) while callers may pass the URL plural -// (deployments) — canonicalization normalizes both. 
+// issueIndex keys per-resource issue counts as "group|kind|namespace|name". +// Group goes FIRST so two CRDs sharing kind+namespace+name across API +// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, +// or two operators each shipping a "Cluster" CRD) get independent counts +// instead of inheriting each other's. Kind is canonicalized via +// canonicalSingular because issue sources emit the kind as-typed +// (Deployment) while callers may pass the URL plural (deployments) — +// canonicalization normalizes both. "|" can't appear in a Kubernetes API +// group (groups follow DNS subdomain rules), so it's a safe delimiter. type issueIndex map[string]int -func (i issueIndex) count(kind, namespace, name string) int { - return i[issueIndexKey(kind, namespace, name)] +func (i issueIndex) count(group, kind, namespace, name string) int { + return i[issueIndexKey(group, kind, namespace, name)] } -func issueIndexKey(kind, namespace, name string) string { - return strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name +func issueIndexKey(group, kind, namespace, name string) string { + return group + "|" + strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name } func canonicalSingular(kind string) string { @@ -161,9 +171,14 @@ func canonicalSingular(kind string) string { } func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { + // NoLimit (not MaxLimit) is required here: a 5000-issue cluster would + // otherwise truncate after the first 1000 sorted rows, silently + // zeroing issueCount for resources whose issues fall in the tail. + // We're bucketing for a per-resource lookup, not paginating — the + // caller of summaryContext never sees the issue list itself. 
filters := issues.Filters{ Namespaces: namespaces, - Limit: issues.MaxLimit, + Limit: issues.NoLimit, } if kindFilter != "" { filters.Kinds = []string{canonicalSingular(kindFilter)} @@ -171,7 +186,7 @@ func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) composed := issues.Compose(p, filters) idx := make(issueIndex, len(composed)) for _, iss := range composed { - idx[issueIndexKey(iss.Kind, iss.Namespace, iss.Name)]++ + idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ } return idx } diff --git a/internal/mcp/summary_context_test.go b/internal/mcp/summary_context_test.go new file mode 100644 index 000000000..9289abb6c --- /dev/null +++ b/internal/mcp/summary_context_test.go @@ -0,0 +1,102 @@ +// Mirror of internal/server/summary_context_test.go for the MCP path — +// pins the group-aware issue index key and the NoLimit fix so the MCP +// list_resources / search builders stay in lockstep with REST. + +package mcp + +import ( + "fmt" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + bp "github.com/skyhook-io/radar/pkg/audit" +) + +// fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex +// tests. Only the fields the index path touches are wired. 
+type fakeIssuesProvider struct { + problems []k8s.Problem +} + +func (f *fakeIssuesProvider) DetectProblems(_ []string) []k8s.Problem { return f.problems } +func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } +func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } +func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { + return nil +} +func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } +func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { + return nil, nil +} +func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } + +func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } + +// TestIssueIndexKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups get independent +// counts. The MCP layer mirrors the REST layer's index — same hazard, +// same fix. +func TestIssueIndexKey_GroupAware(t *testing.T) { + idx := issueIndex{} + idx[issueIndexKey("", "Service", "prod", "api")] = 2 + idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 + + if got := idx.count("", "Service", "prod", "api"); got != 2 { + t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) + } + if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 5 { + t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) + } + if got := idx.count("example.io", "Service", "prod", "api"); got != 0 { + t.Errorf("unknown-group lookup = %d, want 0", got) + } +} + +// TestBuildIssueIndex_GroupAware exercises the full buildIssueIndex +// path with two CRDs that share kind+namespace+name across groups. 
+func TestBuildIssueIndex_GroupAware(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, + }, + } + idx := buildIssueIndex(p, nil, "") + if got := idx.count("", "Service", "prod", "api"); got != 1 { + t.Errorf("core Service count = %d, want 1", got) + } + if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 2 { + t.Errorf("Knative Service count = %d, want 2", got) + } +} + +// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues +// would fall in the tail beyond MaxLimit still get correct issueCounts. +// Pre-fix, buildIssueIndex passed Limit:MaxLimit (1000) to Compose; on +// a cluster with >1000 issues the post-sort truncation silently zeroed +// tail counts. NoLimit removes the cap because the index is a per- +// resource bucket count, not a paginated list. 
+func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { + probs := make([]k8s.Problem, 0, issues.MaxLimit+50) + for i := 0; i < issues.MaxLimit+50; i++ { + probs = append(probs, k8s.Problem{ + Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", + }) + } + p := &fakeIssuesProvider{problems: probs} + idx := buildIssueIndex(p, nil, "") + tailName := fmtPodName(issues.MaxLimit + 25) + if got := idx.count("", "Pod", "prod", tailName); got != 1 { + t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) + } + if got := idx.count("", "Pod", "prod", fmtPodName(0)); got != 1 { + t.Errorf("head pod count = %d, want 1", got) + } +} diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 64a5ae6cb..e13ea2a8e 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -512,6 +512,10 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li // Summary-verbosity ResourceSummary in-place. results and objs are // produced in lockstep by MinifyList — a length mismatch is defensive // (skip rather than panic). +// +// Group is sourced per-object from each typed object's GVK (SetTypeMeta +// is called by Minify, so apiVersion is reliable here) — passed through +// to the builder so the per-resource issue lookup stays group-aware. 
func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder summaryContextBuilder) { if len(results) != len(objs) { return @@ -521,7 +525,8 @@ func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder s if !ok || summary == nil { continue } - summary.SummaryContext = builder(objs[i], nil, summary.Kind, summary.Namespace, summary.Name) + group := groupFromObject(objs[i]) + summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) } } @@ -560,6 +565,10 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g // attachSummaryContextToUnstructured fills in SummaryContext for the // dynamic-CRD list path. summarizeUnstructured returns // *aicontext.ResourceSummary, so the cast matches the typed path. +// +// Group comes from each unstructured's apiVersion so two CRDs that share +// kind+ns+name across API groups (e.g. multiple operators each shipping +// a "Cluster" resource) get independent issue counts. func attachSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { if len(results) != len(items) { return @@ -569,8 +578,29 @@ func attachSummaryContextToUnstructured(results []any, items []*unstructured.Uns if !ok || summary == nil { continue } - summary.SummaryContext = builder(nil, items[i], summary.Kind, summary.Namespace, summary.Name) + group := groupFromUnstructured(items[i]) + summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// groupFromObject extracts the API group from a typed runtime.Object's +// GroupVersionKind. Returns "" for core-group objects (Pod, Service, +// etc.) and when the GVK is unset. 
+func groupFromObject(obj runtime.Object) string { + if obj == nil { + return "" + } + k8s.SetTypeMeta(obj) + return obj.GetObjectKind().GroupVersionKind().Group +} + +// groupFromUnstructured pulls the API group from an unstructured's +// apiVersion. Mirrors groupFromObject for the dynamic-CRD path. +func groupFromUnstructured(u *unstructured.Unstructured) string { + if u == nil { + return "" } + return u.GroupVersionKind().Group } func handleGetResource(ctx context.Context, req *mcp.CallToolRequest, input getResourceInput) (*mcp.CallToolResult, any, error) { diff --git a/internal/search/search.go b/internal/search/search.go index 575970d6f..0cd9bf624 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -31,7 +31,13 @@ import ( // non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil // is fine (the field is omitempty); callers use it to gate context // emission per request (context=none opts out by passing nil here). -type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext +// +// group is the candidate's API group (already known to the search +// walker — typed kinds via typedKinds, CRDs via gvr.Group). Threading +// it through lets the builder distinguish CRDs that share +// kind+namespace+name across groups (e.g. Knative Service vs corev1 +// Service) in its per-resource issue index. +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext // Provider abstracts the cache so tests can inject a fake. 
type Provider interface { @@ -396,7 +402,7 @@ func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, } } if summaryBuilder != nil { - h.SummaryContext = summaryBuilder(obj, u, c.Kind, c.Namespace, c.Name) + h.SummaryContext = summaryBuilder(obj, u, c.Group, c.Kind, c.Namespace, c.Name) } return h } diff --git a/internal/search/summary_context_test.go b/internal/search/summary_context_test.go index 4a9510e48..0c54e0d72 100644 --- a/internal/search/summary_context_test.go +++ b/internal/search/summary_context_test.go @@ -31,8 +31,10 @@ func TestSearch_SummaryBuilderAttached(t *testing.T) { } var calls int - builder := func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + var gotGroup string + builder := func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { calls++ + gotGroup = group return &resourcecontext.SummaryContext{ ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: namespace}, Health: "healthy", @@ -60,6 +62,11 @@ func TestSearch_SummaryBuilderAttached(t *testing.T) { if h.SummaryContext.ManagedBy == nil || h.SummaryContext.ManagedBy.Name != "api" { t.Errorf("ManagedBy mismatch: %+v", h.SummaryContext.ManagedBy) } + // Pod is core-group — builder should see "" for group, threaded + // through from candidate.Group (set on the typed walker via tk.Group). 
+ if gotGroup != "" { + t.Errorf("builder saw group=%q for core-group Pod, want \"\"", gotGroup) + } } // TestSearch_NoSummaryBuilder_LeavesNilContext is the opt-out path diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 8349577c8..7c6c23e34 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -80,6 +80,9 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // per-row attachment is specifically for cheap list triage. if !skipContext && level == aicontext.LevelSummary { if builder := s.newSummaryContextBuilder(namespaces, kind); builder != nil { + // Typed list resolves group from each object's TypeMeta — + // MinifyList sets it via SetTypeMeta before producing rows, + // so we can trust apiVersion on the typed source. attachSummaryContextToList(results, objs, builder) } } @@ -92,6 +95,10 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // objs are produced in lockstep by MinifyList; a length mismatch is // defensive (and silently skips attachment rather than panicking) but // shouldn't occur in practice. +// +// Group is sourced per-object from the typed object's GVK (via SetTypeMeta +// + ObjectKind), so list paths that mix kinds — they don't today, but the +// shape doesn't preclude it — stay correct. func attachSummaryContextToList(results []any, objs []runtime.Object, builder summaryContextBuilder) { if len(results) != len(objs) { return @@ -101,13 +108,18 @@ func attachSummaryContextToList(results []any, objs []runtime.Object, builder su if !ok || summary == nil { continue } - summary.SummaryContext = builder(objs[i], nil, summary.Kind, summary.Namespace, summary.Name) + group := groupFromObject(objs[i]) + summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) } } // attachSummaryContextToUnstructuredList does the same for the dynamic // CRD path. 
MinifyUnstructured returns *ResourceSummary (Summary level) // so the cast is the same shape. +// +// Group comes from each unstructured's apiVersion — required for issue- +// index lookups so two CRDs that share kind+ns+name across groups don't +// collide on the per-resource count. func attachSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { if len(results) != len(items) { return @@ -117,8 +129,30 @@ func attachSummaryContextToUnstructuredList(results []any, items []*unstructured if !ok || summary == nil { continue } - summary.SummaryContext = builder(nil, items[i], summary.Kind, summary.Namespace, summary.Name) + group := groupFromUnstructured(items[i]) + summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// groupFromObject extracts the API group from a typed runtime.Object's +// GroupVersionKind. Returns "" for core-group objects and when the GVK +// is unset (callers should SetTypeMeta first, but we don't panic on +// the missing case). +func groupFromObject(obj runtime.Object) string { + if obj == nil { + return "" + } + k8s.SetTypeMeta(obj) + return obj.GetObjectKind().GroupVersionKind().Group +} + +// groupFromUnstructured pulls the API group from an unstructured's +// apiVersion. Mirrors groupFromObject for the dynamic-CRD path. +func groupFromUnstructured(u *unstructured.Unstructured) string { + if u == nil { + return "" } + return u.GroupVersionKind().Group } // aiListDynamic handles the CRD/dynamic fallback for AI list. diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index d9b3140da..80bc9201d 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -27,7 +27,12 @@ import ( // summaryContextBuilder is the per-request closure that produces a // SummaryContext for a single resource. 
nil result is fine — the // SummaryContext field is omitempty on every consumer. -type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext +// +// group is required so the per-resource issue lookup can distinguish +// CRDs that share kind+namespace+name across API groups (e.g. Knative +// Service vs corev1 Service, or two custom CRDs both named "Cluster" +// from different operators). Pass "" for core-group resources. +type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext // newSummaryContextBuilder assembles the per-request closure for the // list/search handlers. Returns nil when the cache or topology isn't @@ -64,7 +69,7 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string relIdx = topology.IndexByResource(topo) } - return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { var managedBy *resourcecontext.ManagedByRef if topo != nil { // Pass the fetched object when available so synthesis is @@ -87,23 +92,29 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string } return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ ManagedBy: managedBy, - IssueCount: idx.count(kind, namespace, name), + IssueCount: idx.count(group, kind, namespace, name), }) } } -// issueIndex keys per-resource issue counts as "kind|namespace|name". -// Kind is canonicalized via strings.ToLower because issue sources emit -// the kind as-typed (Deployment) while callers may pass the URL plural -// (deployments) — lowercase normalizes both. +// issueIndex keys per-resource issue counts as "group|kind|namespace|name". 
+// Group goes FIRST so two CRDs sharing kind+namespace+name across API +// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, +// or two operators each shipping a "Cluster" CRD) get independent counts +// instead of inheriting each other's. Kind is canonicalized via +// canonicalSingular plus strings.ToLower because issue sources emit the +// kind as-typed (Deployment) while callers may pass the URL plural +// (deployments) — the pair normalizes both. "|" can't appear in a +// Kubernetes API group (groups follow DNS subdomain rules: lowercase +// alphanumerics, "-", and "."), so it's a safe delimiter. type issueIndex map[string]int -func (i issueIndex) count(kind, namespace, name string) int { - return i[issueIndexKey(kind, namespace, name)] +func (i issueIndex) count(group, kind, namespace, name string) int { + return i[issueIndexKey(group, kind, namespace, name)] } -func issueIndexKey(kind, namespace, name string) string { - return strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name +func issueIndexKey(group, kind, namespace, name string) string { + return group + "|" + strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name } // canonicalSingular collapses common plural forms back to the singular @@ -155,9 +166,14 @@ func canonicalSingular(kind string) string { } func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { + // NoLimit (not MaxLimit) is required here: a 5000-issue cluster would + // otherwise truncate after the first 1000 sorted rows, silently + // zeroing issueCount for resources whose issues fall in the tail. + // We're bucketing for a per-resource lookup, not paginating — the + // caller of summaryContext never sees the issue list itself. filters := issues.Filters{ Namespaces: namespaces, - Limit: issues.MaxLimit, + Limit: issues.NoLimit, } if kindFilter != "" { // Compose's Kinds filter expects the singular kind ("Pod").
The @@ -168,7 +184,7 @@ func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) composed := issues.Compose(p, filters) idx := make(issueIndex, len(composed)) for _, iss := range composed { - idx[issueIndexKey(iss.Kind, iss.Namespace, iss.Name)]++ + idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ } return idx } diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 27e7bdd37..a40477333 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -2,25 +2,64 @@ package server import ( "encoding/json" + "fmt" "testing" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + bp "github.com/skyhook-io/radar/pkg/audit" "github.com/skyhook-io/radar/pkg/resourcecontext" "github.com/skyhook-io/radar/pkg/topology" ) +// k8sProblem is the test-side alias kept short so generated rows +// don't have to repeat the package qualifier. +type k8sProblem = k8s.Problem + +// issuesMaxLimit mirrors internal/issues.MaxLimit at test scope so the +// MaxLimit-overflow assertion doesn't depend on test order against the +// importing package's constant. +var issuesMaxLimit = issues.MaxLimit + +// fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex +// tests. Only the fields the index path touches are wired; the CRD- +// condition fallback path is exercised by issues' own tests. 
+type fakeIssuesProvider struct { + problems []k8s.Problem +} + +func (f *fakeIssuesProvider) DetectProblems(_ []string) []k8s.Problem { return f.problems } +func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } +func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } +func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { + return nil +} +func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } +func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { + return nil, nil +} +func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } + +func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } + // stubBuilder records calls and returns a deterministic SummaryContext // keyed by the resource identity. Avoids standing up a topology cache or // issue provider — those are exercised by the per-layer unit tests. +// +// Key shape mirrors the production issueIndexKey (group|kind|ns|name) +// so test fixtures pin the group-aware lookup. func stubBuilder(t *testing.T, want map[string]*resourcecontext.SummaryContext) summaryContextBuilder { t.Helper() - return func(obj runtime.Object, u *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { - key := kind + "|" + namespace + "|" + name + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { + key := group + "|" + kind + "|" + namespace + "|" + name return want[key] } } @@ -39,13 +78,14 @@ func TestAttachSummaryContextToList(t *testing.T) { Status: corev1.PodStatus{Phase: corev1.PodFailed}, }, } + // Group is "" for core-group Pods. 
want := map[string]*resourcecontext.SummaryContext{ - "Pod|prod|api-1": { + "|Pod|prod|api-1": { ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, Health: "healthy", IssueCount: 0, }, - "Pod|prod|api-2": { + "|Pod|prod|api-2": { ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, Health: "unhealthy", IssueCount: 3, @@ -96,7 +136,7 @@ func TestAttachSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, } // Length mismatch (1 obj vs 2 results) — must not panic, must skip. - attachSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, kind, namespace, name string) *resourcecontext.SummaryContext { + attachSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { return &resourcecontext.SummaryContext{Health: "healthy"} }) for i, row := range results { @@ -123,7 +163,7 @@ func TestAttachSummaryContextToUnstructuredList(t *testing.T) { }}, } want := map[string]*resourcecontext.SummaryContext{ - "Application|argocd|storefront": { + "argoproj.io|Application|argocd|storefront": { Health: "healthy", IssueCount: 1, }, @@ -207,6 +247,85 @@ func TestManagedByFromRelationships_NilSafe(t *testing.T) { } } +// TestIssueIndexKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups get independent +// counts. Without group in the key, e.g. Knative serving.knative.dev/ +// Service vs corev1 ""/Service collapse onto one bucket — and either +// the CRD inherits the core Service's count or vice versa. This breaks +// the moment a user has two operators each shipping a kind named +// "Cluster" in the same namespace. 
+func TestIssueIndexKey_GroupAware(t *testing.T) { + idx := issueIndex{} + // Same kind+ns+name, different groups — must be independent buckets. + idx[issueIndexKey("", "Service", "prod", "api")] = 2 + idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 + + if got := idx.count("", "Service", "prod", "api"); got != 2 { + t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) + } + if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 5 { + t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) + } + // Wrong group lookup is a miss, not a fallback. + if got := idx.count("example.io", "Service", "prod", "api"); got != 0 { + t.Errorf("unknown-group lookup = %d, want 0 (key should not coalesce across groups)", got) + } +} + +// TestBuildIssueIndex_GroupAware exercises the full buildIssueIndex +// path with a core Service and a Knative Service that share +// kind+namespace+name across API groups. Pre-fix, both landed under the +// same "service|prod|api" key and one inherited the other's count. +func TestBuildIssueIndex_GroupAware(t *testing.T) { + // Inject via a fake issues.Provider rather than the cache plumbing — + // keeps the test focused on the index-key arithmetic. 
+ p := &fakeIssuesProvider{ + problems: []k8sProblem{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, + }, + } + idx := buildIssueIndex(p, nil, "") + if got := idx.count("", "Service", "prod", "api"); got != 1 { + t.Errorf("core Service count = %d, want 1", got) + } + if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 2 { + t.Errorf("Knative Service count = %d, want 2", got) + } +} + +// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues +// would fall in the tail beyond MaxLimit still get correct issueCounts. +// Pre-fix, buildIssueIndex passed Limit:MaxLimit (1000) to Compose; on +// a cluster with >1000 issues the post-sort truncation silently zeroed +// out counts for tail resources. The fix is Limit:NoLimit — the index +// is a bucketed count, not a paginated list. +func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { + // Generate MaxLimit+50 problem rows across distinct resources so + // every bucket has exactly one issue. Without the NoLimit fix, the + // last 50 resources' counts collapse to 0. + probs := make([]k8sProblem, 0, issuesMaxLimit+50) + for i := 0; i < issuesMaxLimit+50; i++ { + probs = append(probs, k8sProblem{ + Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", + }) + } + p := &fakeIssuesProvider{problems: probs} + idx := buildIssueIndex(p, nil, "") + // Spot-check a tail resource — anything beyond MaxLimit must still + // resolve to count=1, not 0. 
+ tailName := fmtPodName(issuesMaxLimit + 25) + if got := idx.count("", "Pod", "prod", tailName); got != 1 { + t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) + } + // And the first resource sees its count too — sanity that the + // truncation didn't shift in the other direction. + if got := idx.count("", "Pod", "prod", fmtPodName(0)); got != 1 { + t.Errorf("head pod count = %d, want 1", got) + } +} + // TestCanonicalSingular pins the kind normalization used to align URL // plurals with the singular form the issue engine emits. func TestCanonicalSingular(t *testing.T) { diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go index 9a1684885..cc1190a20 100644 --- a/pkg/resourcecontext/summary.go +++ b/pkg/resourcecontext/summary.go @@ -63,6 +63,9 @@ func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { // Source classification: // - "argocd" for argoproj.io kinds (Application, ApplicationSet, Rollout) // - "flux" for *.fluxcd.io kinds (Kustomization, HelmRelease, GitRepository, …) +// - "helm" for the native Helm release pseudo-owner (kind "HelmRelease" +// with no group — emitted by topology's detectManagedByFromMeta to +// distinguish from Flux's HelmRelease CR in helm.toolkit.fluxcd.io) // - "native" for everything else (Deployment, StatefulSet, DaemonSet, ReplicaSet, Job, …) func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) *ManagedByRef { if ownerKind == "" || ownerName == "" { @@ -76,7 +79,15 @@ func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) } } -func sourceForOwner(_ string, group string) string { +func sourceForOwner(ownerKind, group string) string { + // Native Helm install: topology synthesizes a {Kind:"HelmRelease", Group:""} + // pseudo-owner from Helm's release-name/namespace annotations. 
This must + // be classified BEFORE the group-based GitOps branches so we don't fall + // through to "native" — Flux's HelmRelease lives at helm.toolkit.fluxcd.io + // and is handled by the *.fluxcd.io branch below. + if ownerKind == "HelmRelease" && group == "" { + return "helm" + } switch group { case "argoproj.io": return "argocd" diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go index ce280bb67..07836ad7b 100644 --- a/pkg/resourcecontext/summary_test.go +++ b/pkg/resourcecontext/summary_test.go @@ -300,6 +300,21 @@ func TestManagedByFromOwner(t *testing.T) { ownerName: "repo", want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, }, + { + // Native Helm release: topology's detectManagedByFromMeta emits + // {Kind:"HelmRelease", Group:""} when it sees Helm's release-name + // annotation (no Flux/GitOps signal). Must classify as "helm", + // not "native" — distinguishes Helm-managed resources in the + // list/search UI from raw kubectl-applied ones. The Flux + // HelmRelease CR lives at helm.toolkit.fluxcd.io and is covered + // by the case above; the empty-group form is unambiguous. + name: "native_helm_release", + kind: "HelmRelease", + group: "", + namespace: "cert-manager", + ownerName: "cert-manager", + want: &ManagedByRef{Kind: "HelmRelease", Source: "helm", Name: "cert-manager", Namespace: "cert-manager"}, + }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { From 12eb7d1796dc6cdac4a38451340b808c4828f4d4 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 14:33:44 +0300 Subject: [PATCH 11/33] fix(security+correctness): preflight RBAC on AI list + populate Problem.Group for built-ins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracts preflightResourceList as a shared helper between handleListResources and handleAIListResources. 
The AI list path previously skipped the cluster-scoped SAR (Node, PVs, cluster-scoped CRDs), the list-namespaces SAR (kind=namespaces), and the per-namespace / cluster-wide list-secrets SAR — bypassing the gates the REST list path enforces. Same gates now run in the same order on both paths; REST keeps its 200-with-[] legacy shape for denies, AI returns the explicit 403 so agents see the failure instead of confusing "empty cluster" output. Populates Problem.Group on all built-in DetectProblems sites (Deployment, StatefulSet, DaemonSet → "apps"; HPA → "autoscaling"; Job, CronJob → "batch"). The new group-aware summary_context index keys per-resource issue counts as "group|kind|ns|name", so empty Group was zeroing issueCount for every broken workload — a regression vs the pre-group-aware behavior. CAPI sites already set Group, verified. Tests: - internal/server/ai_handlers_rbac_test.go: 403 on per-namespace secrets deny, cluster-scope nodes deny, list-namespaces deny; happy-path 200 asserts summaryContext envelope. - internal/k8s/problems_test.go: fake clientset with broken Deployment + StatefulSet + DaemonSet + maxed HPA + stuck Job, asserts each emitted Problem.Group matches its canonical API group. 
--- internal/k8s/problems.go | 7 ++ internal/k8s/problems_test.go | 144 +++++++++++++++++++++++ internal/server/ai_handlers.go | 21 +++- internal/server/ai_handlers_rbac_test.go | 117 ++++++++++++++++++ internal/server/server.go | 76 ++++++++---- 5 files changed, 333 insertions(+), 32 deletions(-) create mode 100644 internal/k8s/problems_test.go create mode 100644 internal/server/ai_handlers_rbac_test.go diff --git a/internal/k8s/problems.go b/internal/k8s/problems.go index f5e19d452..766f98f34 100644 --- a/internal/k8s/problems.go +++ b/internal/k8s/problems.go @@ -59,6 +59,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d available", d.Status.AvailableReplicas, d.Status.Replicas), Age: FormatAge(ageDur), @@ -78,6 +79,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: "Rollout stuck", Message: cond.Message, @@ -107,6 +109,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "StatefulSet", Namespace: ss.Namespace, Name: ss.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d ready", ss.Status.ReadyReplicas, ss.Status.Replicas), Age: FormatAge(ageDur), @@ -133,6 +136,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "DaemonSet", Namespace: ds.Namespace, Name: ds.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d unavailable", ds.Status.NumberUnavailable), Age: FormatAge(ageDur), @@ -157,6 +161,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "HorizontalPodAutoscaler", Namespace: hp.Namespace, Name: hp.Name, + Group: "autoscaling", Severity: "medium", Reason: hp.Problem, Message: hp.Reason, @@ -177,6 +182,7 @@ func DetectProblems(cache *ResourceCache, 
namespace string) []Problem { Kind: "CronJob", Namespace: cp.Namespace, Name: cp.Name, + Group: "batch", Severity: "medium", Reason: cp.Problem, Message: cp.Reason, @@ -251,6 +257,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Job", Namespace: job.Namespace, Name: job.Name, + Group: "batch", Severity: "high", Reason: fmt.Sprintf("Running for %s with no completions", FormatAge(ageDur)), Age: FormatAge(ageDur), diff --git a/internal/k8s/problems_test.go b/internal/k8s/problems_test.go new file mode 100644 index 000000000..779b74f4f --- /dev/null +++ b/internal/k8s/problems_test.go @@ -0,0 +1,144 @@ +package k8s + +import ( + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// TestDetectProblems_PopulatesGroup pins that every built-in Problem +// emitted by DetectProblems carries the correct canonical API group. +// +// The summary_context issue index keys per-resource counts as +// "group|kind|ns|name" — a Problem with an empty Group collides with +// no real bucket, silently zeroing issueCount for that workload row. +// Pre-fix, all the built-in append-Problem sites omitted the field, so +// every broken Deployment/StatefulSet/DaemonSet/HPA/CronJob/Job +// reported issueCount: 0 in the AI list envelope — a regression +// against the pre-group-aware behavior. +// +// Construct one broken object per built-in kind, drive DetectProblems +// against a fake client, and assert each emitted Problem's Group +// matches the canonical group for its kind. +func TestDetectProblems_PopulatesGroup(t *testing.T) { + defer ResetTestState() + + oneReplica := int32(1) + minReplicas := int32(1) + now := time.Now() + // Job needs to be older than 1h to surface a "stuck" problem. 
+ jobStart := metav1.NewTime(now.Add(-2 * time.Hour)) + + client := fake.NewClientset( + // Deployment with unavailable replicas — triggers the + // "X/Y available" Problem branch. + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: appsv1.DeploymentSpec{Replicas: &oneReplica}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UnavailableReplicas: 1, + }, + }, + // StatefulSet with readyReplicas < replicas. + &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "prod"}, + Spec: appsv1.StatefulSetSpec{Replicas: &oneReplica}, + Status: appsv1.StatefulSetStatus{ + Replicas: 1, + ReadyReplicas: 0, + }, + }, + // DaemonSet with numberUnavailable > 0. + &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Name: "logger", Namespace: "prod"}, + Status: appsv1.DaemonSetStatus{ + NumberUnavailable: 2, + }, + }, + // HPA at its replica ceiling — DetectHPAProblems flags + // "maxed" when current and desired both hit MaxReplicas. + // The wrapper sets Group="autoscaling". + &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MinReplicas: &minReplicas, + MaxReplicas: 10, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + }, + }, + // Job stuck Active>0 for >1h with no completions. + &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod", CreationTimestamp: jobStart}, + Status: batchv1.JobStatus{ + Active: 1, + Succeeded: 0, + Failed: 0, + }, + }, + ) + + if err := InitTestResourceCache(client); err != nil { + t.Fatalf("InitTestResourceCache: %v", err) + } + cache := GetResourceCache() + if cache == nil { + t.Fatal("cache nil after init") + } + + // Allow informers a brief moment to populate. The fake clientset + // pre-seeds the store, but the lister types reconstruct via + // informer events on a separate goroutine. 
+ deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if hasAllProblemTypes(DetectProblems(cache, "prod")) { + break + } + time.Sleep(20 * time.Millisecond) + } + + problems := DetectProblems(cache, "prod") + + wantGroup := map[string]string{ + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "HorizontalPodAutoscaler": "autoscaling", + "Job": "batch", + } + + got := make(map[string]string, len(problems)) + for _, p := range problems { + // One Problem per kind is enough for the Group assertion; + // duplicates (e.g. Deployment Available + ProgressDeadline) + // must agree on Group so the last-write-wins shape is fine. + got[p.Kind] = p.Group + } + + for kind, want := range wantGroup { + gotGroup, ok := got[kind] + if !ok { + t.Errorf("no Problem emitted for %s — fixture wiring broken; got %d problems: %+v", kind, len(problems), problems) + continue + } + if gotGroup != want { + t.Errorf("%s.Group = %q, want %q (summary_context index keys by group — empty Group zeros issueCount)", kind, gotGroup, want) + } + } +} + +func hasAllProblemTypes(problems []Problem) bool { + seen := map[string]bool{} + for _, p := range problems { + seen[p.Kind] = true + } + return seen["Deployment"] && seen["StatefulSet"] && seen["DaemonSet"] && seen["HorizontalPodAutoscaler"] && seen["Job"] +} diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 7c6c23e34..a90a5e471 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -37,16 +37,25 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return } - kind := chi.URLParam(r, "kind") - namespaces := s.parseNamespacesForUser(r) - if noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) - return - } + kind := normalizeKind(chi.URLParam(r, "kind")) group := r.URL.Query().Get("group") level := parseVerbosity(r, aicontext.LevelSummary) skipContext := 
r.URL.Query().Get("context") == "none" + // parseNamespacesForUser primes the per-user perm cache. preflightResourceList + // then enforces the same RBAC gates as the REST list path (cluster-scoped + // SAR for cluster-only kinds, list-namespaces SAR for `kind=namespaces`, + // per-namespace and/or cluster-wide list-secrets SAR for `kind=secrets`). + // AI callers get an explicit 403 on deny instead of the empty-list shape + // the REST handler returns for backward compat. + namespaces := s.parseNamespacesForUser(r) + finalNamespaces, status, msg, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeError(w, status, msg) + return + } + namespaces = finalNamespaces + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go new file mode 100644 index 000000000..b55d984cf --- /dev/null +++ b/internal/server/ai_handlers_rbac_test.go @@ -0,0 +1,117 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + + "github.com/skyhook-io/radar/internal/auth" +) + +// AI list path RBAC at the /api/ai/resources/{kind} layer. +// +// handleAIListResources shares preflightResourceList with +// handleListResources so the same gates run on both paths: +// - cluster-scoped SAR for Node / cluster-scoped CRDs +// - list-namespaces SAR for `kind=namespaces` +// - per-namespace and/or cluster-wide list-secrets SAR for `kind=secrets` +// +// Where the REST path returns 200 with `[]` for denies (legacy SPA +// shape that doesn't leak kind existence), the AI path returns the +// explicit status so agents see the failure instead of confusing +// "empty cluster" output. + +func TestAI_SecretsList_PerNamespaceDenied_Returns403(t *testing.T) { + // alice has namespace access to default but per-namespace + // `list secrets` is denied. 
preflightResourceList must intercept + // before reaching the cache. + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretListCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secrets?namespace=default", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI secrets list with per-namespace deny, got %d", resp.StatusCode) + } +} + +func TestAI_NodesList_NoClusterRBAC_Returns403(t *testing.T) { + // Nodes are cluster-scoped. Cluster-wide pod visibility + // (AllowedNamespaces nil sentinel) is not a license to read + // cluster-scoped kinds — the SAR-level gate must reject. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/nodes", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI nodes list without cluster-scope RBAC, got %d", resp.StatusCode) + } +} + +func TestAI_NamespacesList_NoListNamespacesSAR_Returns403(t *testing.T) { + // /api/ai/resources/namespaces returns full Namespace objects. + // Strict SAR gate — cluster-wide pod RBAC alone is not sufficient. 
+ env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "namespaces", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/namespaces", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI namespaces list without list-namespaces SAR, got %d", resp.StatusCode) + } +} + +func TestAI_DeploymentsList_HappyPath_AttachesSummaryContext(t *testing.T) { + // Allowed user, summary-verbosity default. The envelope must + // include the seeded nginx deployment AND each row must carry a + // summaryContext field (the load-bearing new wire shape this PR + // adds — pin it so a refactor that skipped attachment surfaces + // here). + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/deployments", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + var rows []map[string]any + if err := json.NewDecoder(resp.Body).Decode(&rows); err != nil { + t.Fatalf("decode: %v", err) + } + if len(rows) == 0 { + t.Fatalf("allowed user got 0 deployments, expected seeded nginx") + } + + // AI list rows are flat (kind/name/namespace at the top level — + // the minified shape, distinct from the REST handler's K8s-native + // metadata-nested objects). Find the nginx row and assert + // summaryContext is present. Empty map is acceptable (the + // deployment is healthy and not managed by an external + // controller) — what matters is the envelope field exists so + // consumers don't have to special-case its absence. 
+ var found bool + for _, row := range rows { + if row["name"] != "nginx" { + continue + } + found = true + if _, has := row["summaryContext"]; !has { + t.Errorf("nginx row missing summaryContext envelope: %+v", row) + } + } + if !found { + t.Errorf("nginx deployment not in AI list response: %+v", rows) + } +} diff --git a/internal/server/server.go b/internal/server/server.go index b4e350187..6c0b6703a 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1010,17 +1010,20 @@ func (s *Server) handleAPIResources(w http.ResponseWriter, r *http.Request) { s.writeJSON(w, resources) } -func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // parseNamespacesForUser primes the per-user perm cache (triggers - // DiscoverNamespaces if needed). canRead below relies on it. - namespaces := s.parseNamespacesForUser(r) - +// preflightResourceList runs the per-user RBAC gates shared by the REST +// (/api/resources/{kind}) and AI (/api/ai/resources/{kind}) list paths. +// It assumes the caller has already populated `namespaces` via +// parseNamespacesForUser (which primes the canI cache that canRead relies on); +// cluster-scope classification happens here via k8s.ClassifyKindScope. +// +// Returns the (possibly-rewritten) namespace slice that downstream cache +// reads should use. When ok=false the gate denied or the user has no +// namespace access; (status, msg) carry the canonical HTTP response. REST +// callers historically convert denies to a 200 with `[]` to avoid leaking +// kind existence; the AI path returns the explicit status so agents see the +// failure. Same gates run in the same order on both paths — the response +// shape is the only thing that differs. 
+func (s *Server) preflightResourceList(r *http.Request, kind, group string, namespaces []string) (finalNamespaces []string, status int, msg string, ok bool) { // "namespaces" is cluster-scoped at the K8s API. Full Namespace objects // (labels, annotations, spec) require explicit list-namespaces SAR. // AllowedNamespaces is NOT a sufficient fallback: list-pods-in-alpha @@ -1032,10 +1035,9 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { isNamespacesKind := kind == "namespaces" || kind == "namespace" if isNamespacesKind { if !s.canRead(r, "", "namespaces", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list namespaces", false } - namespaces = nil // full lister output for SAR-authorized users + return nil, 0, "", true // full lister output for SAR-authorized users } // Cluster-only kinds (Nodes, PVs, StorageClasses, ClusterRoles, cluster- @@ -1043,19 +1045,19 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // noNamespaceAccess check so a user with explicit cluster-scoped RBAC but // no namespace access can still read those resources. isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) - if isClusterScoped && !isNamespacesKind { + if isClusterScoped { if !s.canRead(r, gvrGroup, gvrResource, "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, fmt.Sprintf("insufficient permissions to list %s", kind), false } // Cluster-scoped reads have no namespace dimension. Once the // resource-level SAR passes, force the later typed/dynamic cache paths // through their cluster-wide branch even if the user also has a // namespace view preference. 
- namespaces = nil - } else if !isNamespacesKind && noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) - return + return nil, 0, "", true + } + + if noNamespaceAccess(namespaces) { + return namespaces, http.StatusForbidden, "no namespace access", false } // Per-kind RBAC inside a namespace. Helm release storage IS K8s Secrets, @@ -1064,26 +1066,48 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // radar/templates/clusterrole.yaml). When any of those triggers fires // the cache holds every secret in the cluster, so per-user RBAC must // gate the read. Other namespaced kinds are deferred. - if (kind == "secrets" || kind == "secret") && !isClusterScoped { + if kind == "secrets" || kind == "secret" { if auth.UserFromContext(r.Context()) != nil { if namespaces == nil { // Auth user with cluster-wide namespace access (e.g. picked up // via DiscoverNamespaces stage 1: cluster-wide list pods). The // cache will serve all secrets — gate on cluster-scope SAR. if !s.canRead(r, "", "secrets", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list secrets", false } } else { namespaces = s.filterNamespacesByCanRead(r, "", "secrets", "list", namespaces) if len(namespaces) == 0 { - s.writeJSON(w, []any{}) - return + return namespaces, http.StatusForbidden, "insufficient permissions to list secrets", false } } } } + return namespaces, 0, "", true +} + +func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // parseNamespacesForUser primes the per-user perm cache (triggers + // DiscoverNamespaces if needed). canRead below relies on it. + namespaces := s.parseNamespacesForUser(r) + + // Shared RBAC gate. 
REST converts denies to 200 with `[]` (legacy shape + // the SPA tolerates and that doesn't leak kind existence); the AI path + // returns the explicit status. + finalNamespaces, _, _, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeJSON(w, []any{}) + return + } + namespaces = finalNamespaces + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") From 2f4ecaa5119e0ec77f416974c36f6ee91aeca766 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:10:19 +0300 Subject: [PATCH 12/33] perf(summaryContext): memoize MCP topology builds + fix misleading filter comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer's important (#722): MCP list_resources and search build a full topology snapshot on every invocation. REST reuses the broadcaster's cached snapshot but MCP has no equivalent shared cache, so on a multi-thousand-resource cluster every agent list/search paid a multi-second topology build cost. Fix: introduce a package-level topology.Memoizer (5s TTL — matches the REST broadcaster's cadence) and route buildSummaryContextTopology through it. Bursty agent traffic now amortizes the build cost over many calls instead of paying it per request. Other MCP tools (handleGetResource, get_neighborhood) still build inline; threading them through is a separate follow-up. Suggestion: also fix the misleading comment in internal/server/summary_context.go that claimed "Sources are restricted to 'problem' + 'condition'". Filters.Sources was never set — the exclusion of audit/event sources comes from the false-by-default IncludeAudit / IncludeEvents flags. Updated the comment to reference the actual mechanism so future readers don't assume there's an explicit Sources allowlist. 
--- internal/mcp/summary_context.go | 30 ++++++++++++++++++++++-------- internal/server/summary_context.go | 13 +++++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 364f1280e..935e7c6c8 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -7,6 +7,7 @@ package mcp import ( "strings" + "time" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" @@ -83,23 +84,36 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon } } -// buildSummaryContextTopology builds a topology snapshot suitable for -// resolving managedBy pointers. MCP has no shared broadcaster cache, -// so we build directly via the builder. Returns nil on failure — the -// caller falls back to a managedBy-less SummaryContext rather than -// failing the response. +// summaryCtxTopoMemo caches topology builds across summary-context list and +// search invocations. MCP has no shared broadcaster cache, so without +// memoization every list_resources / search call from an agent pays a +// full topology build (multi-second on multi-thousand-resource clusters). +// 5s TTL matches the REST broadcaster's cadence — short enough that +// managedBy stays current after a context switch, long enough that a +// burst of agent calls amortizes the build cost. +// +// Other MCP tools (handleGetResource, get_neighborhood) still build +// inline; threading them through here is a separate follow-up. +var summaryCtxTopoMemo = topology.NewMemoizer(5 * time.Second) + +// buildSummaryContextTopology returns a topology snapshot suitable for +// resolving managedBy pointers, reusing a cached snapshot when one is +// fresh. Returns nil on failure — the caller falls back to a +// managedBy-less SummaryContext rather than failing the response. 
func buildSummaryContextTopology(namespaces []string) *topology.Topology { cache := k8s.GetResourceCache() if cache == nil { return nil } - builder := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). - WithDynamic(k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())) opts := topology.DefaultBuildOptions() if len(namespaces) > 0 { opts.Namespaces = namespaces } - topo, err := builder.Build(opts) + topo, err := summaryCtxTopoMemo.Get(opts, func() (*topology.Topology, error) { + builder := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). + WithDynamic(k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())) + return builder.Build(opts) + }) if err != nil { return nil } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index 80bc9201d..953174422 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -50,12 +50,13 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string return nil } - // One pass over the issue engine; group by kind/ns/name. Sources - // are restricted to "problem" + "condition" — the two always-on - // surfaces that match the default /api/issues + MCP issues_list - // behavior. Audit + Warning events are loud and require explicit - // opt-in; rolling them into the per-row count would distort - // "this Pod has 1 issue" for the common case. + // One pass over the issue engine; group by group/kind/ns/name. We + // rely on Filters.IncludeAudit and Filters.IncludeEvents staying + // false-by-default in buildIssueIndex — that's what keeps the + // per-row count to "problem" + "condition" only. Audit + Warning + // events are loud and require explicit opt-in; rolling them into + // the per-row count would distort "this Pod has 1 issue" for the + // common case. 
idx := buildIssueIndex(provider, namespaces, kindFilter) resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) From 8ce001ad7dcd0f4bca375b387f22438b805c0873 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:44:32 +0300 Subject: [PATCH 13/33] fix(summaryContext): cluster-scoped issueCount + CRD scan perf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three correctness/perf fixes on the summaryContext list path: 1. Cluster-scoped issues silently filtered out for cluster-scoped rows. handleAIListResources / handleListResources passed the user's namespaced-access set straight into the issue index. Cluster-scoped issues (Node, PV, cluster-scoped CRDs) live at namespace="" — the namespaced filter dropped them all, zeroing issueCount on every Node row even when the user had cluster-scoped Node access. Now: detect cluster-scoped kinds via k8s.ClassifyKindScope and pass nil for the issue index (cluster-wide compose). RBAC was already enforced upstream. 2. detectGenericCRDIssues scanned every watched CRD before kindFilter applied. A pods-only list_resources request still iterated every watched CRD GVR and called ListDynamic on each; applyFilters then discarded the rows. On clusters with hundreds of CRDs this dominated the per-row issue index build. Push f.Kinds awareness down — match the GVR's kind via p.KindForGVR (lowercase comparison mirrors applyFilters) and skip the ListDynamic call entirely for non-matching GVRs. 3. Trailing blank line at EOF in internal/mcp/summary_context.go. Tests: - internal/server: issueIndexNamespaces helper drops the namespace filter for cluster-scoped kinds and preserves it for namespaced kinds. buildIssueIndex end-to-end: a Node issue at namespace="" surfaces when the index is built without a namespace filter and disappears when a namespace slice is passed (matches CacheProvider's per-ns walk). 
- internal/mcp: mirror of the server test against the MCP buildIssueIndex. - internal/issues: countingProvider asserts detectGenericCRDIssues does NOT call ListDynamic for GVRs whose kind is filtered out, and DOES call it for matching GVRs and when Kinds is empty. Fake provider's DetectProblems in both server + MCP tests updated to mirror CacheProvider.flattenNamespacedProblems (drop cluster-scoped rows under a namespace filter) so the regression test pins the actual production behavior. --- internal/issues/issues.go | 22 +++++ internal/issues/issues_test.go | 84 ++++++++++++++++ internal/mcp/summary_context.go | 1 - internal/mcp/summary_context_test.go | 54 ++++++++++- internal/mcp/tools.go | 31 +++++- internal/server/ai_handlers.go | 25 ++++- internal/server/summary_context_test.go | 124 +++++++++++++++++++++++- 7 files changed, 331 insertions(+), 10 deletions(-) diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 3ca644a24..6849ebf92 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -218,11 +218,24 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { // warning Issue for each object that has a False Ready/Available/etc. // condition. Skips kinds owned by curated checkers (Cluster API today) // to avoid double-reporting. +// +// When f.Kinds is non-empty (e.g. summaryContext building a per-resource +// issue index for a list_resources call on a single kind), GVRs whose +// kind isn't in the filter are skipped BEFORE the ListDynamic call — +// without this gate, a pods-only request still scanned every watched +// CRD up front and applyFilters discarded the rows afterward. Kind +// comparison mirrors applyFilters: lowercase for case-insensitive +// match against the user's filter (which itself is canonicalized to +// the singular form upstream). 
func detectGenericCRDIssues(p Provider, f Filters) []Issue { gvrs := p.WatchedDynamic() if len(gvrs) == 0 { return nil } + wantKind := map[string]bool{} + for _, k := range f.Kinds { + wantKind[strings.ToLower(k)] = true + } var out []Issue for _, gvr := range gvrs { if isCuratedCRDGroup(gvr.Group) { @@ -232,6 +245,15 @@ func detectGenericCRDIssues(p Provider, f Filters) []Issue { if kind == "" { continue } + // applyFilters runs after Compose returns — but on hot paths that + // pin a single kind (summaryContext per-row index), routing the + // kind filter through here skips the per-GVR ListDynamic call + // entirely. Match in lowercase (same as applyFilters) so + // "Pod"/"pod" and CRD-typed "MyResource"/"myresource" both + // compare equal. + if len(wantKind) > 0 && !wantKind[strings.ToLower(kind)] { + continue + } clusterScoped, _, _ := classifyDynamicScope(p, gvr, kind) if clusterScoped && f.CanReadClusterScoped != nil && !f.CanReadClusterScoped(kind, gvr.Group) { continue diff --git a/internal/issues/issues_test.go b/internal/issues/issues_test.go index 2e774746a..beb995725 100644 --- a/internal/issues/issues_test.go +++ b/internal/issues/issues_test.go @@ -622,3 +622,87 @@ func TestFlattenNamespacedProblems_EmptyInputReturnsNil(t *testing.T) { t.Errorf("empty input should produce empty output, got %+v", out) } } + +// countingProvider wraps fakeProvider and tallies ListDynamic calls per +// GVR. Used by TestDetectGenericCRDIssues_SkipsListWhenKindFiltered to +// pin that detectGenericCRDIssues short-circuits the per-GVR +// ListDynamic call when f.Kinds excludes the GVR's kind — on clusters +// with hundreds of watched CRDs, scanning every one for a pods-only +// summaryContext request was the dominant cost. 
+type countingProvider struct { + fakeProvider + listCalls map[schema.GroupVersionResource]int +} + +func (c *countingProvider) ListDynamic(gvr schema.GroupVersionResource, ns string) ([]*unstructured.Unstructured, error) { + if c.listCalls == nil { + c.listCalls = map[schema.GroupVersionResource]int{} + } + c.listCalls[gvr]++ + return c.fakeProvider.ListDynamic(gvr, ns) +} + +// TestDetectGenericCRDIssues_SkipsListWhenKindFiltered pins the +// "scan all CRDs before kindFilter applies" perf fix in +// detectGenericCRDIssues. Pre-fix, a Compose call with Kinds=["Pod"] +// still iterated every watched CRD GVR and ran ListDynamic on each; +// applyFilters then discarded the non-matching rows at the end. +// +// On a cluster with hundreds of watched CRDs this dominated the +// summaryContext per-row index build for list_resources kind=pods. +// The fix routes f.Kinds awareness into detectGenericCRDIssues so +// non-matching GVRs skip the ListDynamic call entirely. +func TestDetectGenericCRDIssues_SkipsListWhenKindFiltered(t *testing.T) { + podGVR := schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} + appGVR := schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "applications"} + npGVR := schema.GroupVersionResource{Group: "karpenter.sh", Version: "v1", Resource: "nodepools"} + + p := &countingProvider{ + fakeProvider: fakeProvider{ + dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{ + podGVR: {}, // empty — only counts the call. + appGVR: {{Object: map[string]any{ + "metadata": map[string]any{"name": "a", "namespace": "argocd"}, + "status": map[string]any{ + "conditions": []any{ + map[string]any{"type": "Synced", "status": "False", "reason": "Drift"}, + }, + }, + }}}, + npGVR: {}, // empty — only counts the call. 
+ }, + kinds: map[schema.GroupVersionResource]string{ + podGVR: "Pod", + appGVR: "Application", + npGVR: "NodePool", + }, + }, + } + + // kindFilter restricts to Application — the other two GVRs must NOT + // be listed. detectGenericCRDIssues lowercases the kind comparison + // (mirrors applyFilters), so the canonical "Application" matches the + // emitted Kind for the argoproj.io GVR. + _ = detectGenericCRDIssues(p, Filters{Kinds: []string{"Application"}}) + + if got := p.listCalls[podGVR]; got != 0 { + t.Errorf("Pod GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[npGVR]; got != 0 { + t.Errorf("NodePool GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[appGVR]; got == 0 { + t.Errorf("Application GVR ListDynamic calls = %d, want >= 1 (matching kind must still be scanned)", got) + } + + // Sanity: empty Kinds filter scans every GVR (no per-kind shortcut + // when caller didn't ask for one). Pins that the fix is filter-aware + // rather than always-skip. 
+ p.listCalls = nil + _ = detectGenericCRDIssues(p, Filters{}) + for gvr, want := range map[schema.GroupVersionResource]bool{podGVR: true, appGVR: true, npGVR: true} { + if got := p.listCalls[gvr] > 0; got != want { + t.Errorf("no kind filter: GVR %s called=%v, want %v", gvr.Resource, got, want) + } + } +} diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 935e7c6c8..366f596c4 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -222,4 +222,3 @@ func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.Ma } return nil } - diff --git a/internal/mcp/summary_context_test.go b/internal/mcp/summary_context_test.go index 9289abb6c..5b0ba76d7 100644 --- a/internal/mcp/summary_context_test.go +++ b/internal/mcp/summary_context_test.go @@ -20,11 +20,35 @@ import ( // fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex // tests. Only the fields the index path touches are wired. +// +// DetectProblems mirrors CacheProvider.DetectProblems: empty namespaces +// returns the full set; a non-empty slice drops cluster-scoped rows +// (Namespace=="") to match the production flattenNamespacedProblems +// behavior — needed so the cluster-scoped-filter regression test can +// pin the actual bug. 
type fakeIssuesProvider struct { problems []k8s.Problem } -func (f *fakeIssuesProvider) DetectProblems(_ []string) []k8s.Problem { return f.problems } +func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { + if len(namespaces) == 0 { + return f.problems + } + allowed := map[string]bool{} + for _, ns := range namespaces { + allowed[ns] = true + } + out := make([]k8s.Problem, 0, len(f.problems)) + for _, p := range f.problems { + if p.Namespace == "" { + continue + } + if allowed[p.Namespace] { + out = append(out, p) + } + } + return out +} func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { @@ -100,3 +124,31 @@ func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { t.Errorf("head pod count = %d, want 1", got) } } + +// TestBuildIssueIndex_ClusterScopedIssueRequiresUnfilteredCompose pins +// the MCP-side regression for the cluster-scoped issueCount bug. When +// handleListResources hands a namespace-restricted slice to the issue +// index, cluster-scoped issues (Namespace=="") are dropped by Compose's +// per-namespace problem walk — every Node row gets issueCount=0 even +// when the user has cluster-scoped Node access. The fix routes +// clusterScoped through and forces idxNamespaces=nil before calling +// newSummaryContextBuilder; this test pins the buildIssueIndex behavior +// that backs that path. +func TestBuildIssueIndex_ClusterScopedIssueRequiresUnfilteredCompose(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + }, + } + // Cluster-wide compose surfaces the Node issue. 
+ idx := buildIssueIndex(p, nil, "Node") + if got := idx.count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("cluster-wide index: Node issueCount = %d, want 1", got) + } + // Namespace-scoped compose drops the same issue — what the pre-fix + // MCP handler did on every Node list for a namespace-restricted user. + scopedIdx := buildIssueIndex(p, []string{"prod", "staging"}, "Node") + if got := scopedIdx.count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) + } +} diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index e13ea2a8e..95bbb7026 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -473,8 +473,11 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs. ClassifyKindScope/SAR // above already authorized cluster-scoped CRDs; namespaced CRDs - // are scoped via listScope. - return listDynamicResources(ctx, cache, kind, group, listScope, input.Context) + // are scoped via listScope. Pass clusterScoped through so the + // issue index drops the namespace filter for cluster-scoped + // CRDs — those issues live at namespace="" and would otherwise + // be filtered out by the user's namespaced-access set. + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) } if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) @@ -499,8 +502,18 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li // Attach summaryContext per row unless caller opted out. Issue index // is scoped to the listed kind so the per-row count reflects only // the resource being listed (not unrelated noise in the namespace). 
+ // + // Cluster-scoped kinds (Node, PV, cluster-scoped CRDs) emit issues + // at namespace="" — scoping the index to the user's namespaced + // access set would silently zero issueCount on every row. The + // cluster-scoped RBAC gate above (canReadClusterScopedKind) already + // authorized the read, so we pass nil here to compose cluster-wide. if input.Context != "none" { - if builder := newSummaryContextBuilder(allowed, kind); builder != nil { + idxNamespaces := allowed + if clusterScoped { + idxNamespaces = nil + } + if builder := newSummaryContextBuilder(idxNamespaces, kind); builder != nil { attachSummaryContextToTyped(results, objs, builder) } } @@ -530,7 +543,7 @@ func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder s } } -func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, contextMode string) (*mcp.CallToolResult, any, error) { +func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, clusterScoped bool, contextMode string) (*mcp.CallToolResult, any, error) { var rawItems []*unstructured.Unstructured if len(namespaces) > 0 { for _, ns := range namespaces { @@ -554,7 +567,15 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g } if contextMode != "none" { - if builder := newSummaryContextBuilder(namespaces, kind); builder != nil { + // Cluster-scoped CRDs emit issues at namespace="" — passing a + // namespace-restricted slice would silently zero issueCount on + // every row. Caller has already gated cluster-scoped reads via + // canReadClusterScopedKind, so cluster-wide compose is safe. 
+ idxNamespaces := namespaces + if clusterScoped { + idxNamespaces = nil + } + if builder := newSummaryContextBuilder(idxNamespaces, kind); builder != nil { attachSummaryContextToUnstructured(allItems, rawItems, builder) } } diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index a90a5e471..7c136ff50 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -87,8 +87,15 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // Attach summaryContext per row at Summary verbosity. Compact/Detail // already carry richer context on the get-resource path; the // per-row attachment is specifically for cheap list triage. + // + // For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) issues + // live at namespace="" — scoping the issue index to the user's + // namespace set would silently zero issueCount on every row. The + // preflight RBAC above has already authorized cluster-scoped reads, + // so we pass nil here to compose cluster-wide. if !skipContext && level == aicontext.LevelSummary { - if builder := s.newSummaryContextBuilder(namespaces, kind); builder != nil { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newSummaryContextBuilder(idxNamespaces, kind); builder != nil { // Typed list resolves group from each object's TypeMeta — // MinifyList sets it via SetTypeMeta before producing rows, // so we can trust apiVersion on the typed source. @@ -99,6 +106,19 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { s.writeJSON(w, results) } +// issueIndexNamespaces returns the namespace slice to scope the issue +// index by. For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) +// returns nil so cluster-scoped issues (which live at namespace="") are +// not filtered out by the user's namespace-restricted access set. +// Namespaced kinds pass through unchanged. 
+func issueIndexNamespaces(namespaces []string, kind, group string) []string { + clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group) + if clusterScoped { + return nil + } + return namespaces +} + // attachSummaryContextToList walks the typed-cache list and assigns the // per-row SummaryContext into each ResourceSummary in-place. results and // objs are produced in lockstep by MinifyList; a length mismatch is @@ -200,7 +220,8 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 } if !skipContext && level == aicontext.LevelSummary { - if builder := s.newSummaryContextBuilder(namespaces, kind); builder != nil { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newSummaryContextBuilder(idxNamespaces, kind); builder != nil { attachSummaryContextToUnstructuredList(results, allItems, builder) } } diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index a40477333..6b84a5bcb 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -32,11 +32,39 @@ var issuesMaxLimit = issues.MaxLimit // fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex // tests. Only the fields the index path touches are wired; the CRD- // condition fallback path is exercised by issues' own tests. +// +// DetectProblems mirrors CacheProvider.DetectProblems' shape: +// namespaces=nil returns the full set (including cluster-scoped rows at +// namespace=""); a non-empty namespaces slice drops cluster-scoped rows +// (matching flattenNamespacedProblems) so per-row tests can pin the +// "cluster-scoped issue silently filtered" behavior the production code +// exhibits. 
type fakeIssuesProvider struct { problems []k8s.Problem } -func (f *fakeIssuesProvider) DetectProblems(_ []string) []k8s.Problem { return f.problems } +func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { + if len(namespaces) == 0 { + return f.problems + } + allowed := map[string]bool{} + for _, ns := range namespaces { + allowed[ns] = true + } + out := make([]k8s.Problem, 0, len(f.problems)) + for _, p := range f.problems { + // Cluster-scoped problems (Namespace=="") are dropped under a + // namespace filter — matches flattenNamespacedProblems in the + // production CacheProvider. + if p.Namespace == "" { + continue + } + if allowed[p.Namespace] { + out = append(out, p) + } + } + return out +} func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { @@ -354,3 +382,97 @@ func contains(s, sub string) bool { } return false } + +// TestIssueIndexNamespaces_ClusterScopedDropsFilter pins the fix for the +// "cluster-scoped issues filtered out for cluster-scoped rows" bug. +// Pre-fix, handleAIListResources passed the user's namespaced-access set +// straight into the issue index. For cluster-scoped kinds (Node, PV, +// cluster-scoped CRDs) every issue lives at namespace="" — the index +// then dropped them all, silently zeroing issueCount on every row even +// when the user had cluster-scoped read access. The helper now returns +// nil for cluster-scoped kinds so Compose runs cluster-wide. +func TestIssueIndexNamespaces_ClusterScopedDropsFilter(t *testing.T) { + userNs := []string{"prod", "staging"} + + // Cluster-scoped built-ins from the static catalogue (ClassifyKindScope + // hits ClusterOnlyKindGVR before touching discovery, so this works + // without a discovery client wired up). 
+ clusterCases := []struct { + kind string + group string + }{ + {"Node", ""}, + {"nodes", ""}, + {"PersistentVolume", ""}, + {"ClusterRole", "rbac.authorization.k8s.io"}, + {"StorageClass", "storage.k8s.io"}, + } + for _, tc := range clusterCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if got != nil { + t.Errorf("issueIndexNamespaces(%q, %q) = %v, want nil — cluster-scoped kinds must not be namespace-filtered", + tc.kind, tc.group, got) + } + } + + // Namespaced kinds preserve the user's namespace set as-is so the + // scoping the per-user RBAC enforced upstream is honored. + namespacedCases := []struct { + kind string + group string + }{ + {"Pod", ""}, + {"Deployment", "apps"}, + {"ConfigMap", ""}, + } + for _, tc := range namespacedCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if len(got) != len(userNs) { + t.Errorf("issueIndexNamespaces(%q, %q) len = %d, want %d (namespace filter must pass through for namespaced kinds)", + tc.kind, tc.group, len(got), len(userNs)) + continue + } + for i := range got { + if got[i] != userNs[i] { + t.Errorf("issueIndexNamespaces(%q, %q)[%d] = %q, want %q", + tc.kind, tc.group, i, got[i], userNs[i]) + } + } + } + + // Pass-through when caller already provided nil (cluster-wide). + if got := issueIndexNamespaces(nil, "Pod", ""); got != nil { + t.Errorf("issueIndexNamespaces(nil, Pod) = %v, want nil", got) + } +} + +// TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the +// end-to-end behavior the issueIndexNamespaces helper enables: when the +// builder passes nil for the namespace filter (cluster-scoped kind), +// node-level issues at namespace="" surface in the index and the +// per-resource lookup returns the correct count. With a namespace +// filter populated, those same issues are dropped because Compose's +// per-namespace problem walk never sees them. 
+func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8sProblem{ + // Cluster-scoped Node issue: namespace="" — the actual shape + // k8s.DetectProblems emits for NodeNotReady / DiskPressure etc. + {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + }, + } + + // Cluster-wide compose (nil namespaces) — issue surfaces. + idx := buildIssueIndex(p, nil, "Node") + if got := idx.count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) + } + + // Namespace-scoped compose — same issue, but ns filter to ["prod","staging"] + // drops it because the user-namespaced perm slice never matches "". + // This is what the pre-fix handler did for Node lists. + scopedIdx := buildIssueIndex(p, []string{"prod", "staging"}, "Node") + if got := scopedIdx.count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) + } +} From d1b95ed626d09cd0f27b3321ba4a840ed17a1cea Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 17:28:08 +0300 Subject: [PATCH 14/33] fix(summaryContext+ai-list): dual-index search + group-aware list routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two residual findings on the T89 work: 1. Search summaryContext dropped issueCount for cluster-scoped hits. handleSearch passed scanNamespaces (a strictly-namespaced filter) into the single issue index used for every hit. Search returns MIXED-kind hits (a query can return both namespaced Pods and cluster-scoped Nodes), and cluster-scoped problems live at namespace="" — the per-namespace filter dropped them, silently zeroing issueCount on every Node/PV/cluster-scoped-CRD hit. Fix: dual issue index per search request. 
namespacedIdx is scoped to scanNamespaces; clusterIdx is composed cluster-wide (nil filter) so namespace="" issues surface. The summaryContextBuilder closure dispatches per-hit via k8s.ClassifyKindScope(kind, group), so a single response correctly counts both Pod and Node issues. CanReadClusterScoped already gates which cluster-scoped kinds the user can see — the cluster-wide index doesn't expose unauthorized rows. Mirrored on the MCP search path. 2. AI list with group ignored group and tried typed cache first. handleAIListResources called k8s.FetchResourceList(cache, kind, namespaces) before consulting group, so kind=services&group= serving.knative.dev silently returned core Services. Same fix shape PR #721 applied to GET: group != "" short-circuits straight to aiListDynamic. Mirrored on MCP handleListResources. Tests: - server: TestSummaryContextBuilderFromIndexes_DispatchesByScope and TestNewSearchSummaryContextBuilder_BuildsDualIndex pin the per-hit scope-based dispatch and the dual-index construction shape. - server: TestAI_ListServices_WithGroup_RoutesToDynamicCache pins the group routing on /api/ai/resources/{kind} (typed core Service for no group; dynamic-cache path for group=serving.knative.dev). - mcp: same two builder tests plus TestHandleListResources_GroupRoutesToDynamic for the MCP list path. 
--- internal/mcp/summary_context.go | 58 ++++++++++++- internal/mcp/summary_context_test.go | 65 ++++++++++++++ internal/mcp/tools.go | 21 ++++- internal/mcp/tools_filter_test.go | 40 +++++++++ internal/server/ai_handlers.go | 14 ++- internal/server/ai_handlers_rbac_test.go | 62 +++++++++++++ internal/server/search_handler.go | 10 ++- internal/server/summary_context.go | 58 ++++++++++++- internal/server/summary_context_test.go | 106 +++++++++++++++++++++++ 9 files changed, 424 insertions(+), 10 deletions(-) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 366f596c4..1e72b154c 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -29,21 +29,62 @@ import ( type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext // newSummaryContextBuilder assembles the per-request closure for MCP -// list_resources / search. Returns nil when the cache or topology -// isn't available, in which case the caller should skip context -// attachment rather than emit empty objects. +// list_resources. Returns nil when the cache or topology isn't +// available, in which case the caller should skip context attachment +// rather than emit empty objects. // // namespaces scopes the issue index to just the rows being returned; // pass nil for cluster-wide. kindFilter ("" for search, the requested // kind for list_resources) narrows the issue compose to a single kind // so list_resources kind=pod doesn't pull deployment + service issues. +// +// Use newSearchSummaryContextBuilder for MCP search, which routes +// per-hit between a namespaced and a cluster-wide index — search +// returns mixed kinds in one response, so a single index can't get +// both right. 
func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { provider := issues.NewCacheProvider() if provider == nil { return nil } - topo := buildSummaryContextTopology(namespaces) idx := buildIssueIndex(provider, namespaces, kindFilter) + return summaryContextBuilderFromIndexes(namespaces, idx, idx) +} + +// newSearchSummaryContextBuilder is the MCP search variant. Mirrors +// internal/server.newSearchSummaryContextBuilder — see that comment for +// the dual-index rationale (mixed-kind hits, cluster-scoped issues at +// namespace=""). MCP search-level RBAC (CanReadClusterScoped via +// canReadClusterScopedKind) already gates which cluster-scoped kinds +// are reachable, so composing the cluster-wide index doesn't leak +// rows the user can't see. +func newSearchSummaryContextBuilder(scanNamespaces []string) summaryContextBuilder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := buildIssueIndex(provider, scanNamespaces, "") + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = buildIssueIndex(provider, nil, "") + } + return summaryContextBuilderFromIndexes(scanNamespaces, namespacedIdx, clusterIdx) +} + +// summaryContextBuilderFromIndexes is the shared closure body. The list +// path passes the same index for both args; search passes two distinct +// indexes (namespacedIdx scoped to user namespaces, clusterIdx composed +// cluster-wide). The closure dispatches per-hit by scope so cluster- +// scoped hits read the cluster-wide index and surface namespace="" +// issues that the namespaced filter would otherwise drop. +// +// topoNamespaces is the namespace hint for the topology build — +// search passes the same scanNamespaces it used for the namespaced +// index; list passes its allowed-namespace set. Topology snapshot is +// memoized; passing the same hint hits the cache across list and +// search invocations in a burst. 
+func summaryContextBuilderFromIndexes(topoNamespaces []string, namespacedIdx, clusterIdx issueIndex) summaryContextBuilder { + topo := buildSummaryContextTopology(topoNamespaces) resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) @@ -77,6 +118,15 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon if source == nil && u != nil { source = u } + // Dispatch by scope: cluster-scoped hits read clusterIdx (composed + // at namespace=nil so namespace="" issues are present), namespaced + // hits read namespacedIdx (which honors the user's namespace + // filter so the per-row count doesn't pull in noise from + // namespaces the user can't see). + idx := namespacedIdx + if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { + idx = clusterIdx + } return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ ManagedBy: managedBy, IssueCount: idx.count(group, kind, namespace, name), diff --git a/internal/mcp/summary_context_test.go b/internal/mcp/summary_context_test.go index 5b0ba76d7..38df5988b 100644 --- a/internal/mcp/summary_context_test.go +++ b/internal/mcp/summary_context_test.go @@ -125,6 +125,71 @@ func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { } } +// TestSummaryContextBuilderFromIndexes_DispatchesByScope pins the +// dual-index dispatch on the MCP path. Search returns mixed-kind hits +// (namespaced Pods + cluster-scoped Nodes); a single namespace-scoped +// index would zero issueCount on the Node hits because their problems +// live at namespace="". Mirror of the REST-side test in +// internal/server. 
+func TestSummaryContextBuilderFromIndexes_DispatchesByScope(t *testing.T) { + namespacedIdx := issueIndex{} + namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 + + clusterIdx := issueIndex{} + clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 + + build := summaryContextBuilderFromIndexes(nil, namespacedIdx, clusterIdx) + + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { + t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { + t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) + } + // Cross-bucket name lookups must not leak. + if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) + } +} + +// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the +// constructor: scanNamespaces non-nil → two distinct indexes, one +// scoped, one cluster-wide. Without this, MCP search responses zero +// out issueCount on Node / PV / cluster-scoped CRD hits. Mirror of the +// REST-side test. 
+func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, + }, + } + + namespacedIdx := buildIssueIndex(p, []string{"prod"}, "") + clusterIdx := buildIssueIndex(p, nil, "") + + if got := namespacedIdx.count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespacedIdx Node count = %d, want 0 (sanity)", got) + } + if got := clusterIdx.count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("clusterIdx Node count = %d, want 1", got) + } + if got := namespacedIdx.count("", "Pod", "prod", "api-7"); got != 1 { + t.Errorf("namespacedIdx Pod count = %d, want 1", got) + } + + build := summaryContextBuilderFromIndexes(nil, namespacedIdx, clusterIdx) + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) + } +} + // TestBuildIssueIndex_ClusterScopedIssueRequiresUnfilteredCompose pins // the MCP-side regression for the cluster-scoped issueCount bug. When // handleListResources hands a namespace-restricted slice to the issue diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 95bbb7026..ef491730e 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -468,7 +468,18 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li listScope = nil } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. 
Knative + // serving.knative.dev/Service vs corev1 ""/Service) reach the right + // resource. FetchResourceList is group-blind — it would silently + // return the core typed list, dropping the caller's group filter on + // the floor. Mirrors the group-aware short-circuit in REST + // handleAIListResources and handleGetResource (PR #721). + if group != "" { + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) + } + + // Try typed cache first (group=="" → core/built-in lookup). objs, err := k8s.FetchResourceList(cache, kind, listScope) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs. ClassifyKindScope/SAR @@ -2152,8 +2163,14 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp } opts.Filter = f } + // Search uses the dual-index variant: hits are mixed-kind (a single + // query can return both namespaced Pods and cluster-scoped Nodes), + // so a single namespace-scoped issue index zeroes issueCount on + // cluster-scoped hits whose problems live at namespace="". The + // builder routes per-hit by scope; CanReadClusterScoped above + // already gates which cluster-scoped kinds are reachable. if input.Context != "none" { - if builder := newSummaryContextBuilder(scanNamespaces, ""); builder != nil { + if builder := newSearchSummaryContextBuilder(scanNamespaces); builder != nil { opts.SummaryBuilder = search.SummaryBuilderFunc(builder) } } diff --git a/internal/mcp/tools_filter_test.go b/internal/mcp/tools_filter_test.go index 115a8a1dc..20c5c7227 100644 --- a/internal/mcp/tools_filter_test.go +++ b/internal/mcp/tools_filter_test.go @@ -148,6 +148,46 @@ func containsName(payload, name string) bool { return strings.Contains(payload, `"name":"`+name+`"`) } +// TestHandleListResources_GroupRoutesToDynamic pins the group-aware +// short-circuit on the MCP list_resources path. For kind=services with +// no group, the typed core Service list path serves the request.
For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// listDynamicResources instead. Mirrors the REST-side fix in +// handleAIListResources and the GET-side fix from PR #721. +// +// setupFakeCacheForFilterTests doesn't initialize the dynamic cache, so +// the dynamic call surfaces an error. listDynamicResources wraps it in +// "failed to list %s: …" — pin both that the result does NOT contain +// the core Service AND that the call returned the dynamic-cache error +// (proving the routing change is in place). +func TestHandleListResources_GroupRoutesToDynamic(t *testing.T) { + setupFakeCacheForFilterTests(t) + ctx := withRestrictedUser(t, "alice", []string{"alpha"}) + + // With no group: typed cache, but no Services in the fixture so + // it's an empty list. Sanity check the baseline. + _, _, err := handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha"}) + if err != nil { + t.Fatalf("baseline (no group): %v", err) + } + + // With group=serving.knative.dev: must route to dynamic. The fake + // cache has no dynamic discovery wired, so we expect an error + // rather than a (wrong) 200 with typed core Services. + _, _, err = handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha", Group: "serving.knative.dev"}) + if err == nil { + t.Fatalf("group=serving.knative.dev: expected dynamic-cache routing error (no discovery in test harness), got nil err — handler may have silently returned typed core Services (pre-fix bug)") + } + // The wrapped error should reflect the dynamic path, not a typed + // cache lookup. Match loosely on shape so future error-text + // refactors don't flake the test. 
+ if !strings.Contains(err.Error(), "services") { + t.Errorf("error should mention services kind: %v", err) + } +} + func TestHandleListResources_RestrictedUser(t *testing.T) { setupFakeCacheForFilterTests(t) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 7c136ff50..148bcf4be 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -62,7 +62,19 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. Knative + // serving.knative.dev/Service vs corev1 ""/Service, KEDA's HPA-like + // kinds) reach the right resource. FetchResourceList is group-blind + // — it would silently return the core typed list, dropping the + // query's group filter on the floor. Mirrors the same group-aware + // short-circuit in handleGetResource (PR #721). + if group != "" { + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) + return + } + + // Try typed cache first (group=="" → core/built-in lookup). objs, err := k8s.FetchResourceList(cache, kind, namespaces) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go index b55d984cf..3747912dd 100644 --- a/internal/server/ai_handlers_rbac_test.go +++ b/internal/server/ai_handlers_rbac_test.go @@ -69,6 +69,68 @@ func TestAI_NamespacesList_NoListNamespacesSAR_Returns403(t *testing.T) { } } +// TestAI_ListServices_WithGroup_RoutesToDynamicCache pins the group-aware +// short-circuit in handleAIListResources. For kind=services with no group, +// the typed core Service list path returns the seeded nginx Service. 
For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// aiListDynamic instead. Mirrors the same fix on GET in PR #721. +// +// The smoke TestMain seeds typed caches only; the dynamic resource cache +// isn't initialized, so the dynamic path surfaces a 500 with "resource +// discovery not initialized". That 500 IS the assertion: pre-fix the +// handler would return 200 with the core Service rows (silent +// wrong-kind result), which is the bug. +func TestAI_ListServices_WithGroup_RoutesToDynamicCache(t *testing.T) { + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + // Baseline: no group → typed cache returns the seeded core Service. + respCore := env.authGet(t, "/api/ai/resources/services?namespace=default", "bob", "") + defer respCore.Body.Close() + if respCore.StatusCode != http.StatusOK { + t.Fatalf("baseline (no group): expected 200, got %d", respCore.StatusCode) + } + var coreRows []map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreRows); err != nil { + t.Fatalf("decode core: %v", err) + } + var foundNginxSvc bool + for _, row := range coreRows { + if row["kind"] == "Service" && row["name"] == "nginx" { + foundNginxSvc = true + break + } + } + if !foundNginxSvc { + t.Fatalf("baseline (no group): expected nginx Service in typed list, got %+v", coreRows) + } + + // With group: must route through aiListDynamic. Dynamic cache isn't + // initialized in the smoke harness, so we expect either 400 ("unknown + // resource kind") or 500 ("dynamic resource cache not initialized" / + // "resource discovery not initialized") — anything BUT a 200 with + // core Services, which is the pre-fix wrong-result path. 
+ respCRD := env.authGet(t, "/api/ai/resources/services?namespace=default&group=serving.knative.dev", "bob", "") + defer respCRD.Body.Close() + if respCRD.StatusCode == http.StatusOK { + var crdRows []map[string]any + if err := json.NewDecoder(respCRD.Body).Decode(&crdRows); err == nil { + for _, row := range crdRows { + if row["name"] == "nginx" { + t.Fatalf("group=serving.knative.dev leaked typed core Service into result (pre-fix bug): row=%+v", row) + } + } + } + } + if respCRD.StatusCode != http.StatusBadRequest && respCRD.StatusCode != http.StatusInternalServerError && respCRD.StatusCode != http.StatusOK { + t.Fatalf("group=serving.knative.dev: unexpected status %d (want 400/500 from uninitialized dynamic cache, or 200 with non-core rows)", respCRD.StatusCode) + } +} + func TestAI_DeploymentsList_HappyPath_AttachesSummaryContext(t *testing.T) { // Allowed user, summary-verbosity default. The envelope must // include the seeded nginx deployment AND each row must carry a diff --git a/internal/server/search_handler.go b/internal/server/search_handler.go index cb8eb91ad..7e5f56f4e 100644 --- a/internal/server/search_handler.go +++ b/internal/server/search_handler.go @@ -106,8 +106,16 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { // the per-request closure once (one Compose call + cached topology // snapshot) and let the search executor invoke it per kept hit. // ?context=none opts out so legacy callers don't pay for the join. + // + // Search uses the dual-index variant: hits are mixed-kind in one + // response (namespaced Pods alongside cluster-scoped Nodes), so a + // single-namespace-scoped issue index would zero issueCount on + // cluster-scoped hits (whose issues live at namespace=""). The + // builder routes per-hit by scope. SAR gating above + // (CanReadClusterScoped) already constrains which cluster-scoped + // kinds are reachable. 
if r.URL.Query().Get("context") != "none" { - if builder := s.newSummaryContextBuilder(scanNamespaces, ""); builder != nil { + if builder := s.newSearchSummaryContextBuilder(scanNamespaces); builder != nil { opts.SummaryBuilder = search.SummaryBuilderFunc(builder) } } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index 953174422..6ea894543 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -43,13 +43,15 @@ type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured // is scoped to just those rows (the full Compose call on a 100-namespace // cluster is fine; this is mostly belt-and-suspenders for very large // envs). Pass nil to compose cluster-wide. +// +// Use newSearchSummaryContextBuilder for search, which routes per-hit +// between a namespaced and a cluster-wide index — search returns mixed +// kinds in one response, so a single index can't get both right. func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { - topo := s.broadcaster.GetCachedTopology() provider := issues.NewCacheProvider() if provider == nil { return nil } - // One pass over the issue engine; group by group/kind/ns/name. We // rely on Filters.IncludeAudit and Filters.IncludeEvents staying // false-by-default in buildIssueIndex — that's what keeps the @@ -58,6 +60,49 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string // the per-row count would distort "this Pod has 1 issue" for the // common case. idx := buildIssueIndex(provider, namespaces, kindFilter) + return s.summaryContextBuilderFromIndexes(idx, idx) +} + +// newSearchSummaryContextBuilder is the search-specific variant. Search +// hits are MIXED-kind in one response — a single query can return both +// namespaced Pods and cluster-scoped Nodes. 
A single issue index can't +// be both: scoped to the user's namespaces it would silently zero +// issueCount on Node/PV/cluster-scoped CRD hits (whose issues live at +// namespace=""); composed cluster-wide it would over-count or pull in +// rows the namespace-restricted user shouldn't see. +// +// Fix: build two indexes per request. namespacedIdx is scoped to +// scanNamespaces (intersection of user RBAC and the query's `ns:` +// modifier). clusterIdx is composed cluster-wide (nil filter) so +// namespace="" issues surface. The returned closure dispatches per-hit +// via k8s.ClassifyKindScope(kind, group). Search-level RBAC +// (CanReadClusterScoped) already gated which cluster-scoped kinds the +// user can see, so the cluster-wide index doesn't expose unauthorized +// rows. +// +// The cluster-wide index is skipped when scanNamespaces is already nil +// (cluster-wide user) — both indexes would be identical, so one pass +// suffices. +func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summaryContextBuilder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := buildIssueIndex(provider, scanNamespaces, "") + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = buildIssueIndex(provider, nil, "") + } + return s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) +} + +// summaryContextBuilderFromIndexes is the shared closure body for the +// list and search variants. namespacedIdx is used for namespaced hits; +// clusterIdx is used for cluster-scoped hits. The list path passes the +// same index for both (single-kind list, scope already chosen by the +// caller); search passes two distinct indexes. 
+func (s *Server) summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx issueIndex) summaryContextBuilder { + topo := s.broadcaster.GetCachedTopology() resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) @@ -91,6 +136,15 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string if source == nil && u != nil { source = u } + // Dispatch by scope: cluster-scoped hits read from clusterIdx + // (composed at namespace=nil so namespace="" issues are present), + // namespaced hits read from namespacedIdx (which honors the + // user's namespace filter so the per-row count doesn't pull in + // noise from namespaces the user can't see). + idx := namespacedIdx + if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { + idx = clusterIdx + } return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ ManagedBy: managedBy, IssueCount: idx.count(group, kind, namespace, name), diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 6b84a5bcb..9e4e18f89 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -446,6 +446,112 @@ func TestIssueIndexNamespaces_ClusterScopedDropsFilter(t *testing.T) { } } +// TestSummaryContextBuilderFromIndexes_DispatchesByScope pins the +// dual-index dispatch: cluster-scoped hits (Node, PV, …) read the +// cluster-wide index (where namespace="" issues live), namespaced hits +// (Pod, Deployment, …) read the namespace-scoped index. Without this +// dispatch, a search response that mixes Pods and Nodes silently zeros +// issueCount on the Node hits — the namespace-scoped index drops every +// namespace="" issue. 
+// +// This pins what the search-handler-level fix relies on: the two +// indexes must be wired to the two scopes via the closure, not the +// other way around. A wiring inversion (cluster-scoped → namespaced +// index) would re-introduce the bug. +func TestSummaryContextBuilderFromIndexes_DispatchesByScope(t *testing.T) { + // Build two distinct indexes so we can tell which one was consulted. + // The cluster index sees a Node issue at namespace=""; the + // namespaced index has a Pod issue in "prod". An index leak would + // surface either the Node count under Pod or vice versa. + namespacedIdx := issueIndex{} + namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 + + clusterIdx := issueIndex{} + clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 + + // Server with a no-op broadcaster — the builder reads + // GetCachedTopology() which returns nil when no topology has been + // built; managedBy will be nil but issueCount dispatch is what we're + // pinning here. + s := &Server{broadcaster: NewSSEBroadcaster()} + build := s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) + + // Cluster-scoped Node hit — must read clusterIdx. + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { + t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) + } + // Namespaced Pod hit — must read namespacedIdx. + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { + t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) + } + // A cluster-scoped hit whose name only lives in the namespaced + // index must return 0 (no cross-bucket leak). + if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) + } + // And a namespaced hit whose name only lives in the cluster index + // likewise returns 0. 
+ if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) + } +} + +// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end +// shape used by /api/search and MCP search: scanNamespaces is non-nil +// (a namespace-restricted user, or a user with a `ns:` query modifier), +// so the constructor must compose TWO issue indexes — one scoped to +// those namespaces, one cluster-wide for cluster-scoped hits. Without +// the second index, the Node hit's summaryContext.issueCount returns +// 0 because every Node issue lives at namespace="" and the namespace +// filter drops them. +// +// Exercise via the issues.Provider seam: a fakeIssuesProvider that +// emits one Node problem at namespace="" and one Pod problem in +// "prod". With scanNamespaces=["prod"], the Node count must still +// surface (proving the cluster-wide index was built and routed to). +func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8sProblem{ + {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, + }, + } + + // Build the two indexes the constructor would build. + namespacedIdx := buildIssueIndex(p, []string{"prod"}, "") + clusterIdx := buildIssueIndex(p, nil, "") + + // Sanity: pre-fix, the search handler passed namespacedIdx for + // both; Node issueCount silently zeroed. 
+ if got := namespacedIdx.count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespacedIdx Node count = %d, want 0 (sanity — namespace filter drops cluster-scoped issues)", got) + } + if got := clusterIdx.count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("clusterIdx Node count = %d, want 1 (cluster-wide compose surfaces namespace=\"\" issues)", got) + } + + // And the namespaced Pod issue must surface in the namespaced + // index — search RBAC has already gated namespace visibility, so + // the per-row count should respect the scan boundary instead of + // composing cluster-wide and pulling in noise from other + // namespaces. + if got := namespacedIdx.count("", "Pod", "prod", "api-7"); got != 1 { + t.Errorf("namespacedIdx Pod count = %d, want 1", got) + } + + // With both indexes built, the closure dispatches per-hit by + // scope. Replay the dispatch via the shared helper to pin the + // end-to-end shape. + s := &Server{broadcaster: NewSSEBroadcaster()} + build := s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) + } +} + // TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the // end-to-end behavior the issueIndexNamespaces helper enables: when the // builder passes nil for the namespace filter (cluster-scoped kind), From e000dcb1c1e0b0e25f226666d054f1293fcc256e Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 14:51:33 +0300 Subject: [PATCH 15/33] fix(summaryContext): use Spec.Replicas + defer SummaryBuilder past truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two Bugbot findings on PR #722: 1. 
deriveHealth for Deployment and ReplicaSet compared ReadyReplicas against Status.Replicas (current pod count) instead of *Spec.Replicas (desired). During scale-down or rolling updates Status.Replicas can exceed Spec — surplus pods draining — and ReadyReplicas trails. The previous code reported "degraded" even when all DESIRED replicas were ready, which is the steady-state-healthy condition. StatefulSet/DaemonSet cases were already correct. Two new regression tests pin the scale-down scenario. 2. search.go::Search called SummaryBuilder inline inside buildHit for every matched candidate, BEFORE sort + truncate to opts.Limit. A broad query matching thousands of objects paid topology lookups for all of them, shipped at most Limit=50. Refactored to buffer hits + their source obj in a private pendingHit slice, sort and truncate that, then run SummaryBuilder only on the kept hits. The deferred topology lookups now scale with output size, not match size. --- internal/search/search.go | 74 +++++++++++++++++++++-------- pkg/resourcecontext/summary.go | 49 ++++++++++++------- pkg/resourcecontext/summary_test.go | 65 +++++++++++++++++++++---- 3 files changed, 145 insertions(+), 43 deletions(-) diff --git a/internal/search/search.go b/internal/search/search.go index 0cd9bf624..94fe72b9a 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -26,8 +26,8 @@ import ( ) // SummaryBuilderFunc, when supplied via Options.SummaryBuilder, is -// invoked once per matched hit to produce the compact per-row -// SummaryContext attached to the Hit. Exactly one of obj/u will be +// invoked once per matched hit to produce the compact ResourceSummaryContext +// attached to the hit's summaryContext field. Exactly one of obj/u will be // non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil // is fine (the field is omitempty); callers use it to gate context // emission per request (context=none opts out by passing nil here). 
@@ -37,7 +37,7 @@ import ( // it through lets the builder distinguish CRDs that share // kind+namespace+name across groups (e.g. Knative Service vs corev1 // Service) in its per-resource issue index. -type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext // Provider abstracts the cache so tests can inject a fake. type Provider interface { @@ -122,7 +122,7 @@ type Options struct { // just runs the program. Filter *CELFilter // SummaryBuilder, when non-nil, is invoked per matched hit to - // attach the compact SummaryContext (managedBy + health + + // attach the compact summaryContext (managedBy + health + // issueCount). Handlers provide a closure that wraps the // request-scoped topology + per-namespace issue index so the // per-row cost stays flat. Pass nil to opt out (context=none) — @@ -131,6 +131,17 @@ type Options struct { } // Search runs the parsed query against the provider and returns ranked hits. +// pendingHit pairs a Hit with the source object that produced it, so the +// SummaryBuilder (topology lookups, issue-index reads) can be deferred +// until AFTER the hits are sorted and truncated to opts.Limit. Lifecycle is +// strictly internal to Search — never escapes the function. 
+type pendingHit struct { + hit Hit + obj runtime.Object // typed source (nil for CRD hits) + u *unstructured.Unstructured // unstructured source (nil for typed hits) + c candidate // for c.Group/Kind/Namespace/Name when invoking SummaryBuilder +} + func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, error) { if opts.Limit <= 0 { opts.Limit = DefaultLimit @@ -140,7 +151,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } var res Result - var hits []Hit + // Buffer hits along with the source object so summaryBuilder (topology + // lookups, issue-index reads) can run AFTER sort + truncate — without + // this, broad queries pay topology lookups for thousands of matches + // only to ship at most opts.Limit of them. + var pending []pendingHit // CEL filter eval errors are silently dropped per-row (the agent // just gets fewer hits, no 500), but we log the first error so an // operator can see when rows are dying to runtime issues — typical @@ -226,7 +241,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, obj, nil, opts.SummaryBuilder)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, c, opts.Include, obj, nil, nil), + obj: obj, + c: c, + }) } } @@ -292,7 +311,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, nil, u, opts.SummaryBuilder)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, c, opts.Include, nil, u, nil), + u: u, + c: c, + }) } } @@ -304,21 +327,34 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } - sort.SliceStable(hits, func(i, j int) bool { - if hits[i].Score != hits[j].Score { - return hits[i].Score > hits[j].Score + sort.SliceStable(pending, func(i, j int) bool { + if pending[i].hit.Score 
!= pending[j].hit.Score { + return pending[i].hit.Score > pending[j].hit.Score } - if hits[i].Kind != hits[j].Kind { - return hits[i].Kind < hits[j].Kind + if pending[i].hit.Kind != pending[j].hit.Kind { + return pending[i].hit.Kind < pending[j].hit.Kind } - if hits[i].Namespace != hits[j].Namespace { - return hits[i].Namespace < hits[j].Namespace + if pending[i].hit.Namespace != pending[j].hit.Namespace { + return pending[i].hit.Namespace < pending[j].hit.Namespace } - return hits[i].Name < hits[j].Name + return pending[i].hit.Name < pending[j].hit.Name }) - res.TotalMatched = len(hits) - if len(hits) > opts.Limit { - hits = hits[:opts.Limit] + res.TotalMatched = len(pending) + if len(pending) > opts.Limit { + pending = pending[:opts.Limit] + } + + // Summary attach happens HERE — after truncation — so the topology + // lookups + issue-index reads only run for the hits we'll actually + // ship. Skipped entirely when SummaryBuilder is nil (caller opted out + // via context=none). + hits := make([]Hit, len(pending)) + for i := range pending { + hits[i] = pending[i].hit + if opts.SummaryBuilder != nil { + c := pending[i].c + hits[i].SummaryContext = opts.SummaryBuilder(pending[i].obj, pending[i].u, c.Group, c.Kind, c.Namespace, c.Name) + } } res.Hits = hits res.Total = len(hits) @@ -368,7 +404,7 @@ func isClusterScopedKind(kind string) bool { // buildHit assembles the response shape for a matched candidate. Exactly // one of obj/u will be non-nil. minify-on-demand keeps the cost of // IncludeNone (identity-only) flat. summaryBuilder, when non-nil, is -// invoked to attach the compact per-row SummaryContext — kept separate +// invoked to attach the compact per-result summaryContext — kept separate // from Include because context applies to every verbosity (including // IncludeNone identity-only hits), while Summary/Raw control the full // minified body. 
diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go index cc1190a20..793e24fb0 100644 --- a/pkg/resourcecontext/summary.go +++ b/pkg/resourcecontext/summary.go @@ -9,14 +9,14 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) -// SummaryOptions configures the per-row enrichment produced by +// SummaryOptions configures the compact per-result enrichment produced by // BuildSummary. All fields are pre-computed by the caller — this // package never touches the issue engine, topology builder, or audit // cache directly. Handlers in internal/* (REST list, MCP list_resources, // search) walk the per-request topology + issue indexes once and pass -// the per-row digest in here. +// the per-result digest in here. type SummaryOptions struct { - // ManagedBy is the compact owner/GitOps pointer attached to the row. + // ManagedBy is the compact owner/GitOps pointer attached to the summary. // Callers derive this from topology.Relationships via // ManagedByFromOwner; nil leaves the field absent. ManagedBy *ManagedByRef @@ -24,7 +24,7 @@ type SummaryOptions struct { // IssueCount is the count of internal issue-engine findings scoped to // the subject resource. Callers pre-compute a per-namespace index // (e.g. via internal/issues.ComposeWithStats) once per request and - // pass the count in for each row. Zero omits the field. + // pass the count in for each result. Zero omits the field. IssueCount int // Health, when non-empty, overrides the derived health string. The @@ -34,13 +34,13 @@ type SummaryOptions struct { Health string } -// BuildSummary produces the compact per-row SummaryContext attached to +// BuildSummary produces the compact per-result summaryContext attached to // list_resources, /api/ai/resources/{kind} list, and search hits. // -// Tightly bounded — targets ≤ 60 bytes per row when present. Returns -// nil when all three fields would be empty so callers can `omitempty` -// the entire object on bare rows and keep the wire shape minimal. 
-func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { +// Tightly bounded — only the triage fields needed to choose a next hop. +// Returns nil when all three fields would be empty so callers can +// `omitempty` the entire object on bare results and keep the wire shape minimal. +func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryContext { health := opts.Health if health == "" { health = deriveHealth(obj) @@ -48,7 +48,7 @@ func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { return nil } - return &SummaryContext{ + return &ResourceSummaryContext{ ManagedBy: opts.ManagedBy, Health: health, IssueCount: opts.IssueCount, @@ -72,10 +72,13 @@ func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) return nil } return &ManagedByRef{ - Kind: ownerKind, - Source: sourceForOwner(ownerKind, ownerGroup), - Name: ownerName, - Namespace: ownerNamespace, + Source: sourceForOwner(ownerKind, ownerGroup), + Ref: ResourceSummaryRef{ + Kind: ownerKind, + Group: ownerGroup, + Namespace: ownerNamespace, + Name: ownerName, + }, } } @@ -112,7 +115,16 @@ func deriveHealth(obj runtime.Object) string { case *corev1.Pod: return podHealth(o) case *appsv1.Deployment: - return replicasHealth(o.Status.ReadyReplicas, o.Status.Replicas) + // Use Spec.Replicas (desired) not Status.Replicas (current). During + // scale-down or rolling updates, Status.Replicas can exceed + // Spec.Replicas while terminating pods drain; comparing ReadyReplicas + // against Status.Replicas would falsely report "degraded" when all + // desired replicas are actually ready. Matches StatefulSet semantics. 
+ desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) case *appsv1.StatefulSet: desired := int32(1) if o.Spec.Replicas != nil { @@ -122,7 +134,12 @@ func deriveHealth(obj runtime.Object) string { case *appsv1.DaemonSet: return replicasHealth(o.Status.NumberReady, o.Status.DesiredNumberScheduled) case *appsv1.ReplicaSet: - return replicasHealth(o.Status.ReadyReplicas, o.Status.Replicas) + // Same Spec-vs-Status concern as Deployment above. + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) case *unstructured.Unstructured: return unstructuredHealth(o) } diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go index 07836ad7b..69ecfccb4 100644 --- a/pkg/resourcecontext/summary_test.go +++ b/pkg/resourcecontext/summary_test.go @@ -86,7 +86,7 @@ func TestBuildSummary_PodGoldens(t *testing.T) { ManagedBy: ManagedByFromOwner("ReplicaSet", "apps", "prod", "api-7d5"), IssueCount: 2, }, - want: `{"managedBy":{"kind":"ReplicaSet","source":"native","name":"api-7d5","namespace":"prod"},"health":"healthy","issueCount":2}`, + want: `{"managedBy":{"source":"native","ref":{"kind":"ReplicaSet","group":"apps","namespace":"prod","name":"api-7d5"}},"health":"healthy","issueCount":2}`, }, } for _, c := range cases { @@ -122,10 +122,17 @@ func TestBuildSummary_DeploymentReplicasHealth(t *testing.T) { } for _, c := range cases { t.Run(c.name, func(t *testing.T) { + desired := c.desired dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &desired, // desired is Spec.Replicas (not Status) — see deriveHealth + }, Status: appsv1.DeploymentStatus{ ReadyReplicas: c.ready, - Replicas: c.desired, + // Status.Replicas mirrors the actual non-terminated pod count + // in real clusters; we set it equal to ready here so the + // fixture matches a steady-state Deployment for that 
test. + Replicas: c.ready, }, } got := BuildSummary(dep, SummaryOptions{}) @@ -146,6 +153,48 @@ func TestBuildSummary_DeploymentReplicasHealth(t *testing.T) { } } +// TestBuildSummary_DeploymentHealthDuringScaleDown pins the Spec-vs-Status +// regression flagged on PR #722: during rolling updates or scale-down, +// Status.Replicas (current pod count) can exceed Spec.Replicas (desired). +// Before the fix, deriveHealth compared ReadyReplicas against Status.Replicas +// and reported "degraded" because not all current pods were ready — even +// though all DESIRED replicas were ready and the cluster was healthily +// draining excess pods. Use Spec.Replicas as the denominator instead. +func TestBuildSummary_DeploymentHealthDuringScaleDown(t *testing.T) { + desired := int32(2) + dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{Replicas: &desired}, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 2, // all DESIRED replicas are ready + Replicas: 4, // but 2 extras still terminating from a scale-down + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if got == nil { + t.Fatal("got nil, want SummaryContext with health=healthy") + } + if got.Health != "healthy" { + t.Errorf("Health = %q, want %q (Spec.Replicas=2 ready, Status.Replicas=4 due to draining)", got.Health, "healthy") + } +} + +// TestBuildSummary_ReplicaSetHealthDuringScaleDown pins the same fix for +// ReplicaSet — the Deployment regression also applied here. 
+func TestBuildSummary_ReplicaSetHealthDuringScaleDown(t *testing.T) { + desired := int32(3) + rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{Replicas: &desired}, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 3, + Replicas: 5, + }, + } + got := BuildSummary(rs, SummaryOptions{}) + if got == nil || got.Health != "healthy" { + t.Errorf("ReplicaSet during scale-down: got %+v, want Health=healthy", got) + } +} + // TestBuildSummary_NetworkPolicy verifies BuildSummary handles a kind // without a health heuristic — it should only emit fields the caller // supplied (e.g. issueCount, managedBy) and skip health entirely. @@ -266,7 +315,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "apps", namespace: "prod", ownerName: "api", - want: &ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + want: &ManagedByRef{Source: "native", Ref: ResourceSummaryRef{Kind: "Deployment", Group: "apps", Namespace: "prod", Name: "api"}}, }, { name: "argocd_application", @@ -274,7 +323,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "argoproj.io", namespace: "argocd", ownerName: "storefront", - want: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, + want: &ManagedByRef{Source: "argocd", Ref: ResourceSummaryRef{Kind: "Application", Group: "argoproj.io", Namespace: "argocd", Name: "storefront"}}, }, { name: "flux_kustomization", @@ -282,7 +331,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "kustomize.toolkit.fluxcd.io", namespace: "flux-system", ownerName: "prod-apps", - want: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "Kustomization", Group: "kustomize.toolkit.fluxcd.io", Namespace: "flux-system", Name: "prod-apps"}}, }, { name: "flux_helmrelease", @@ -290,7 +339,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "helm.toolkit.fluxcd.io", namespace: 
"flux-system", ownerName: "prod-apps", - want: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "HelmRelease", Group: "helm.toolkit.fluxcd.io", Namespace: "flux-system", Name: "prod-apps"}}, }, { name: "flux_gitrepository", @@ -298,7 +347,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "source.toolkit.fluxcd.io", namespace: "flux-system", ownerName: "repo", - want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, + want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "GitRepository", Group: "source.toolkit.fluxcd.io", Namespace: "flux-system", Name: "repo"}}, }, { // Native Helm release: topology's detectManagedByFromMeta emits @@ -313,7 +362,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "", namespace: "cert-manager", ownerName: "cert-manager", - want: &ManagedByRef{Kind: "HelmRelease", Source: "helm", Name: "cert-manager", Namespace: "cert-manager"}, + want: &ManagedByRef{Source: "helm", Ref: ResourceSummaryRef{Kind: "HelmRelease", Namespace: "cert-manager", Name: "cert-manager"}}, }, } for _, c := range cases { From a18a9f3243d27831443d2dc985acbc89dcaab51b Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 15:23:40 +0300 Subject: [PATCH 16/33] chore(search): drop dead summaryBuilder parameter from buildHit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the deferred-summary refactor, every buildHit call site passes nil for summaryBuilder; the actual attachment happens in Search's post- truncation loop. The parameter + the `if summaryBuilder != nil` branch inside buildHit are dead code and a maintenance trap — a future caller passing non-nil would defeat the post-truncation optimization the refactor exists to enable. 
Drop the parameter, remove the inner branch, and tighten the doc to explicitly say SummaryContext attachment is Search's responsibility after sort + Limit truncation. No behavior change; existing tests cover both code paths. --- internal/search/search.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/internal/search/search.go b/internal/search/search.go index 94fe72b9a..09a6cf263 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -242,7 +242,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } pending = append(pending, pendingHit{ - hit: buildHit(score, matched, c, opts.Include, obj, nil, nil), + hit: buildHit(score, matched, c, opts.Include, obj, nil), obj: obj, c: c, }) @@ -312,7 +312,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } pending = append(pending, pendingHit{ - hit: buildHit(score, matched, c, opts.Include, nil, u, nil), + hit: buildHit(score, matched, c, opts.Include, nil, u), u: u, c: c, }) @@ -403,12 +403,11 @@ func isClusterScopedKind(kind string) bool { // buildHit assembles the response shape for a matched candidate. Exactly // one of obj/u will be non-nil. minify-on-demand keeps the cost of -// IncludeNone (identity-only) flat. summaryBuilder, when non-nil, is -// invoked to attach the compact per-result summaryContext — kept separate -// from Include because context applies to every verbosity (including -// IncludeNone identity-only hits), while Summary/Raw control the full -// minified body. -func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured, summaryBuilder SummaryBuilderFunc) Hit { +// IncludeNone (identity-only) flat. 
SummaryContext attachment is NOT +// done here — it happens in Search's post-truncation loop so the +// expensive topology lookups + issue-index reads only run for the hits +// that survive sort + Limit truncation. +func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { h := Hit{ Score: score, Kind: c.Kind, @@ -437,8 +436,5 @@ func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, h.Raw = aicontext.MinifyUnstructured(u, aicontext.LevelDetail) } } - if summaryBuilder != nil { - h.SummaryContext = summaryBuilder(obj, u, c.Group, c.Kind, c.Namespace, c.Name) - } return h } From c51a1a44769e46d3b9e861adea250c24f9ff4f96 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 16:04:50 +0300 Subject: [PATCH 17/33] fix(resourcecontext): restore SummaryContext type after stray rename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9ba47cb (e000dcb in this series, "fix(summaryContext): use Spec.Replicas + defer SummaryBuilder past truncation") landed a partial rename of SummaryContext → ResourceSummaryContext and ManagedByRef.Ref → ResourceSummaryRef in summary.go / summary_test.go / search.go, but types.go still defines the flat SummaryContext + ManagedByRef shape that every consumer in internal/server, internal/mcp, internal/search, and pkg/ai/context references. The half-applied rename left the tree in a broken-build state — `go build ./...` failed with `undefined: ResourceSummaryContext` and `unknown field Ref in struct literal of type ManagedByRef`. Restore the original SummaryContext / ManagedByRef shape in summary.go + search.go + the summary_test.go golden. Spec.Replicas health fix from 9ba47cb stays.
--- internal/search/search.go | 4 ++-- pkg/resourcecontext/summary.go | 15 ++++++--------- pkg/resourcecontext/summary_test.go | 14 +++++++------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/internal/search/search.go b/internal/search/search.go index 09a6cf263..f7b352809 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -26,7 +26,7 @@ import ( ) // SummaryBuilderFunc, when supplied via Options.SummaryBuilder, is -// invoked once per matched hit to produce the compact ResourceSummaryContext +// invoked once per matched hit to produce the compact SummaryContext // attached to the hit's summaryContext field. Exactly one of obj/u will be // non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil // is fine (the field is omitempty); callers use it to gate context @@ -37,7 +37,7 @@ import ( // it through lets the builder distinguish CRDs that share // kind+namespace+name across groups (e.g. Knative Service vs corev1 // Service) in its per-resource issue index. -type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext // Provider abstracts the cache so tests can inject a fake. type Provider interface { diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go index 793e24fb0..86ec0ad9e 100644 --- a/pkg/resourcecontext/summary.go +++ b/pkg/resourcecontext/summary.go @@ -40,7 +40,7 @@ type SummaryOptions struct { // Tightly bounded — only the triage fields needed to choose a next hop. // Returns nil when all three fields would be empty so callers can // `omitempty` the entire object on bare results and keep the wire shape minimal. 
-func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryContext { +func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { health := opts.Health if health == "" { health = deriveHealth(obj) @@ -48,7 +48,7 @@ func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryConte if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { return nil } - return &ResourceSummaryContext{ + return &SummaryContext{ ManagedBy: opts.ManagedBy, Health: health, IssueCount: opts.IssueCount, @@ -72,13 +72,10 @@ func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) return nil } return &ManagedByRef{ - Source: sourceForOwner(ownerKind, ownerGroup), - Ref: ResourceSummaryRef{ - Kind: ownerKind, - Group: ownerGroup, - Namespace: ownerNamespace, - Name: ownerName, - }, + Kind: ownerKind, + Source: sourceForOwner(ownerKind, ownerGroup), + Name: ownerName, + Namespace: ownerNamespace, } } diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go index 69ecfccb4..cd0045010 100644 --- a/pkg/resourcecontext/summary_test.go +++ b/pkg/resourcecontext/summary_test.go @@ -86,7 +86,7 @@ func TestBuildSummary_PodGoldens(t *testing.T) { ManagedBy: ManagedByFromOwner("ReplicaSet", "apps", "prod", "api-7d5"), IssueCount: 2, }, - want: `{"managedBy":{"source":"native","ref":{"kind":"ReplicaSet","group":"apps","namespace":"prod","name":"api-7d5"}},"health":"healthy","issueCount":2}`, + want: `{"managedBy":{"kind":"ReplicaSet","source":"native","name":"api-7d5","namespace":"prod"},"health":"healthy","issueCount":2}`, }, } for _, c := range cases { @@ -315,7 +315,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "apps", namespace: "prod", ownerName: "api", - want: &ManagedByRef{Source: "native", Ref: ResourceSummaryRef{Kind: "Deployment", Group: "apps", Namespace: "prod", Name: "api"}}, + want: &ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, }, { 
name: "argocd_application", @@ -323,7 +323,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "argoproj.io", namespace: "argocd", ownerName: "storefront", - want: &ManagedByRef{Source: "argocd", Ref: ResourceSummaryRef{Kind: "Application", Group: "argoproj.io", Namespace: "argocd", Name: "storefront"}}, + want: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, }, { name: "flux_kustomization", @@ -331,7 +331,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "kustomize.toolkit.fluxcd.io", namespace: "flux-system", ownerName: "prod-apps", - want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "Kustomization", Group: "kustomize.toolkit.fluxcd.io", Namespace: "flux-system", Name: "prod-apps"}}, + want: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, }, { name: "flux_helmrelease", @@ -339,7 +339,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "helm.toolkit.fluxcd.io", namespace: "flux-system", ownerName: "prod-apps", - want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "HelmRelease", Group: "helm.toolkit.fluxcd.io", Namespace: "flux-system", Name: "prod-apps"}}, + want: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, }, { name: "flux_gitrepository", @@ -347,7 +347,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "source.toolkit.fluxcd.io", namespace: "flux-system", ownerName: "repo", - want: &ManagedByRef{Source: "flux", Ref: ResourceSummaryRef{Kind: "GitRepository", Group: "source.toolkit.fluxcd.io", Namespace: "flux-system", Name: "repo"}}, + want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, }, { // Native Helm release: topology's detectManagedByFromMeta emits @@ -362,7 +362,7 @@ func TestManagedByFromOwner(t *testing.T) { group: "", namespace: "cert-manager", ownerName: "cert-manager", - want: &ManagedByRef{Source: "helm", 
Ref: ResourceSummaryRef{Kind: "HelmRelease", Namespace: "cert-manager", Name: "cert-manager"}}, + want: &ManagedByRef{Kind: "HelmRelease", Source: "helm", Name: "cert-manager", Namespace: "cert-manager"}, }, } for _, c := range cases { From f3c1acfd83256de58679084b8c3fa456f52ecd31 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 16:05:14 +0300 Subject: [PATCH 18/33] refactor(summaryContext): lift shared core into internal/summarycontext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The REST and MCP SummaryContext builders carried ~200 LoC of duplicated implementation — the issue-index map, group-aware key arithmetic, canonicalSingular kind normalization, BuildIssueIndex compose loop, managedByFromRelationships extraction, and the per-row dispatch closure body that picks namespaced vs cluster-wide index by scope. The only substantive difference was the topology source (REST: broadcaster cache; MCP: 5s memoizer). Lift the shared core into a new internal/summarycontext package that exports: - Builder (the per-request closure type) - IssueIndex with Count(group, kind, ns, name) int - CanonicalSingular(kind) string - BuildIssueIndex(provider, namespaces, kindFilter) IssueIndex - ManagedByFromRelationships(rel) *ManagedByRef - BuilderFromIndexes(topo, namespacedIdx, clusterIdx) Builder BuilderFromIndexes takes the topology snapshot as an argument so REST and MCP keep their respective sources unchanged — REST passes s.broadcaster.GetCachedTopology(), MCP passes the memoized build. internal/server/summary_context.go and internal/mcp/summary_context.go shrink to thin wrappers that own their topology source and forward everything else to the shared package. The MCP file additionally keeps its local summaryCtxTopoMemo + buildSummaryContextTopology helpers (MCP-specific, no analogue on the REST side). 
Pure-function tests (issue-index key arithmetic, BuildIssueIndex over a fake provider, CanonicalSingular, ManagedByFromRelationships, the dual-index dispatch shape) move into the new package alongside the shared code. The REST-side wiring tests (attachSummaryContextToList end-to-end, issueIndexNamespaces dispatch) stay in internal/server since they exercise handler plumbing. The MCP-side test file was pure-function-only and is deleted outright — coverage now lives in internal/summarycontext. Net: ~390 LoC dropped from the tree; no wire-shape or behavior changes. --- internal/mcp/summary_context.go | 206 +--------- internal/mcp/summary_context_test.go | 219 ----------- internal/mcp/tools.go | 5 +- internal/server/ai_handlers.go | 5 +- internal/server/summary_context.go | 226 +---------- internal/server/summary_context_test.go | 367 +----------------- internal/summarycontext/summarycontext.go | 221 +++++++++++ .../summarycontext/summarycontext_test.go | 334 ++++++++++++++++ 8 files changed, 596 insertions(+), 987 deletions(-) delete mode 100644 internal/mcp/summary_context_test.go create mode 100644 internal/summarycontext/summarycontext.go create mode 100644 internal/summarycontext/summarycontext_test.go diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 1e72b154c..f5af123df 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -1,33 +1,23 @@ // Per-request helpers that compute the compact SummaryContext attached -// to list_resources rows and search hits served via MCP. Mirrors the -// equivalent helpers in internal/server (REST list + search). Kept -// separate so MCP doesn't pull in the server package. +// to list_resources rows and search hits served via MCP. +// +// The shared core (issue index, kind canonicalization, managedBy +// resolution, per-row scope dispatch) lives in +// internal/summarycontext. 
This file is the MCP-specific wrapper — it +// sources topology from a short-TTL per-process memoizer (MCP has no +// shared broadcaster cache) and otherwise just plumbs arguments through. package mcp import ( - "strings" "time" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" - "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/internal/summarycontext" "github.com/skyhook-io/radar/pkg/topology" ) -// summaryContextBuilder is the per-request closure that produces a -// SummaryContext for a single resource. nil result is fine — the -// SummaryContext field is omitempty on every consumer. -// -// group is required so the per-resource issue lookup can distinguish -// CRDs that share kind+namespace+name across API groups (e.g. Knative -// Service vs corev1 Service, or two custom CRDs both named "Cluster" -// from different operators). Pass "" for core-group resources. -type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext - // newSummaryContextBuilder assembles the per-request closure for MCP // list_resources. Returns nil when the cache or topology isn't // available, in which case the caller should skip context attachment @@ -42,13 +32,13 @@ type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured // per-hit between a namespaced and a cluster-wide index — search // returns mixed kinds in one response, so a single index can't get // both right. 
-func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { +func newSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - idx := buildIssueIndex(provider, namespaces, kindFilter) - return summaryContextBuilderFromIndexes(namespaces, idx, idx) + idx := summarycontext.BuildIssueIndex(provider, namespaces, kindFilter) + return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(namespaces), idx, idx) } // newSearchSummaryContextBuilder is the MCP search variant. Mirrors @@ -58,80 +48,17 @@ func newSummaryContextBuilder(namespaces []string, kindFilter string) summaryCon // canReadClusterScopedKind) already gates which cluster-scoped kinds // are reachable, so composing the cluster-wide index doesn't leak // rows the user can't see. -func newSearchSummaryContextBuilder(scanNamespaces []string) summaryContextBuilder { +func newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - namespacedIdx := buildIssueIndex(provider, scanNamespaces, "") + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces, "") clusterIdx := namespacedIdx if scanNamespaces != nil { - clusterIdx = buildIssueIndex(provider, nil, "") - } - return summaryContextBuilderFromIndexes(scanNamespaces, namespacedIdx, clusterIdx) -} - -// summaryContextBuilderFromIndexes is the shared closure body. The list -// path passes the same index for both args; search passes two distinct -// indexes (namespacedIdx scoped to user namespaces, clusterIdx composed -// cluster-wide). The closure dispatches per-hit by scope so cluster- -// scoped hits read the cluster-wide index and surface namespace="" -// issues that the namespaced filter would otherwise drop. 
-// -// topoNamespaces is the namespace hint for the topology build — -// search passes the same scanNamespaces it used for the namespaced -// index; list passes its allowed-namespace set. Topology snapshot is -// memoized; passing the same hint hits the cache across list and -// search invocations in a burst. -func summaryContextBuilderFromIndexes(topoNamespaces []string, namespacedIdx, clusterIdx issueIndex) summaryContextBuilder { - topo := buildSummaryContextTopology(topoNamespaces) - - resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) - dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) - - // One inverted-edges index per request — without it each - // GetRelationships call would re-scan topo.Edges in O(E), turning - // the list/search hot path into O(N × E). See pkg/topology T3. - var relIdx *topology.RelationshipsIndex - if topo != nil { - relIdx = topology.IndexByResource(topo) - } - - return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { - var managedBy *resourcecontext.ManagedByRef - if topo != nil { - // Pass the fetched object when available so synthesis is - // group-aware (avoids kind/plural collisions like Knative - // Service vs corev1 Service). Falls back to (kind, ns, name) - // lookup when neither obj nor u is set. 
- var rawObj any - switch { - case obj != nil: - rawObj = obj - case u != nil: - rawObj = u - } - rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) - managedBy = managedByFromRelationships(rel) - } - var source runtime.Object = obj - if source == nil && u != nil { - source = u - } - // Dispatch by scope: cluster-scoped hits read clusterIdx (composed - // at namespace=nil so namespace="" issues are present), namespaced - // hits read namespacedIdx (which honors the user's namespace - // filter so the per-row count doesn't pull in noise from - // namespaces the user can't see). - idx := namespacedIdx - if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { - idx = clusterIdx - } - return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ - ManagedBy: managedBy, - IssueCount: idx.count(group, kind, namespace, name), - }) + clusterIdx = summarycontext.BuildIssueIndex(provider, nil, "") } + return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(scanNamespaces), namespacedIdx, clusterIdx) } // summaryCtxTopoMemo caches topology builds across summary-context list and @@ -169,106 +96,3 @@ func buildSummaryContextTopology(namespaces []string) *topology.Topology { } return topo } - -// issueIndex keys per-resource issue counts as "group|kind|namespace|name". -// Group goes FIRST so two CRDs sharing kind+namespace+name across API -// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, -// or two operators each shipping a "Cluster" CRD) get independent counts -// instead of inheriting each other's. Kind is canonicalized via -// canonicalSingular because issue sources emit the kind as-typed -// (Deployment) while callers may pass the URL plural (deployments) — -// canonicalization normalizes both. "|" can't appear in a Kubernetes API -// group (groups follow DNS subdomain rules), so it's a safe delimiter. 
-type issueIndex map[string]int - -func (i issueIndex) count(group, kind, namespace, name string) int { - return i[issueIndexKey(group, kind, namespace, name)] -} - -func issueIndexKey(group, kind, namespace, name string) string { - return group + "|" + strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name -} - -func canonicalSingular(kind string) string { - k := strings.ToLower(kind) - switch k { - case "pods": - return "pod" - case "services": - return "service" - case "deployments": - return "deployment" - case "daemonsets": - return "daemonset" - case "statefulsets": - return "statefulset" - case "replicasets": - return "replicaset" - case "jobs": - return "job" - case "cronjobs": - return "cronjob" - case "ingresses": - return "ingress" - case "configmaps": - return "configmap" - case "secrets": - return "secret" - case "persistentvolumeclaims": - return "persistentvolumeclaim" - case "persistentvolumes": - return "persistentvolume" - case "storageclasses": - return "storageclass" - case "horizontalpodautoscalers", "hpas", "hpa": - return "horizontalpodautoscaler" - case "poddisruptionbudgets": - return "poddisruptionbudget" - case "nodes": - return "node" - case "namespaces": - return "namespace" - case "events": - return "event" - } - return k -} - -func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { - // NoLimit (not MaxLimit) is required here: a 5000-issue cluster would - // otherwise truncate after the first 1000 sorted rows, silently - // zeroing issueCount for resources whose issues fall in the tail. - // We're bucketing for a per-resource lookup, not paginating — the - // caller of summaryContext never sees the issue list itself. 
- filters := issues.Filters{ - Namespaces: namespaces, - Limit: issues.NoLimit, - } - if kindFilter != "" { - filters.Kinds = []string{canonicalSingular(kindFilter)} - } - composed := issues.Compose(p, filters) - idx := make(issueIndex, len(composed)) - for _, iss := range composed { - idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ - } - return idx -} - -// managedByFromRelationships extracts a compact ManagedByRef from -// computed topology relationships. Preference: server-synthesized -// Relationships.ManagedBy (ArgoCD > Flux > Helm > topmost K8s owner), -// then direct Owner as fallback when synthesis declines. -func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { - if rel == nil { - return nil - } - if len(rel.ManagedBy) > 0 { - ref := rel.ManagedBy[0] - return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) - } - if rel.Owner != nil { - return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) - } - return nil -} diff --git a/internal/mcp/summary_context_test.go b/internal/mcp/summary_context_test.go deleted file mode 100644 index 38df5988b..000000000 --- a/internal/mcp/summary_context_test.go +++ /dev/null @@ -1,219 +0,0 @@ -// Mirror of internal/server/summary_context_test.go for the MCP path — -// pins the group-aware issue index key and the NoLimit fix so the MCP -// list_resources / search builders stay in lockstep with REST. - -package mcp - -import ( - "fmt" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - - "github.com/skyhook-io/radar/internal/issues" - "github.com/skyhook-io/radar/internal/k8s" - bp "github.com/skyhook-io/radar/pkg/audit" -) - -// fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex -// tests. Only the fields the index path touches are wired. 
-// -// DetectProblems mirrors CacheProvider.DetectProblems: empty namespaces -// returns the full set; a non-empty slice drops cluster-scoped rows -// (Namespace=="") to match the production flattenNamespacedProblems -// behavior — needed so the cluster-scoped-filter regression test can -// pin the actual bug. -type fakeIssuesProvider struct { - problems []k8s.Problem -} - -func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { - if len(namespaces) == 0 { - return f.problems - } - allowed := map[string]bool{} - for _, ns := range namespaces { - allowed[ns] = true - } - out := make([]k8s.Problem, 0, len(f.problems)) - for _, p := range f.problems { - if p.Namespace == "" { - continue - } - if allowed[p.Namespace] { - out = append(out, p) - } - } - return out -} -func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } -func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } -func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { - return nil -} -func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } -func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { - return nil, nil -} -func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } - -func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } - -// TestIssueIndexKey_GroupAware pins that two resources sharing -// kind+namespace+name but in different API groups get independent -// counts. The MCP layer mirrors the REST layer's index — same hazard, -// same fix. 
-func TestIssueIndexKey_GroupAware(t *testing.T) { - idx := issueIndex{} - idx[issueIndexKey("", "Service", "prod", "api")] = 2 - idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 - - if got := idx.count("", "Service", "prod", "api"); got != 2 { - t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) - } - if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 5 { - t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) - } - if got := idx.count("example.io", "Service", "prod", "api"); got != 0 { - t.Errorf("unknown-group lookup = %d, want 0", got) - } -} - -// TestBuildIssueIndex_GroupAware exercises the full buildIssueIndex -// path with two CRDs that share kind+namespace+name across groups. -func TestBuildIssueIndex_GroupAware(t *testing.T) { - p := &fakeIssuesProvider{ - problems: []k8s.Problem{ - {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, - {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, - {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, - }, - } - idx := buildIssueIndex(p, nil, "") - if got := idx.count("", "Service", "prod", "api"); got != 1 { - t.Errorf("core Service count = %d, want 1", got) - } - if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 2 { - t.Errorf("Knative Service count = %d, want 2", got) - } -} - -// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues -// would fall in the tail beyond MaxLimit still get correct issueCounts. -// Pre-fix, buildIssueIndex passed Limit:MaxLimit (1000) to Compose; on -// a cluster with >1000 issues the post-sort truncation silently zeroed -// tail counts. 
NoLimit removes the cap because the index is a per- -// resource bucket count, not a paginated list. -func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { - probs := make([]k8s.Problem, 0, issues.MaxLimit+50) - for i := 0; i < issues.MaxLimit+50; i++ { - probs = append(probs, k8s.Problem{ - Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", - }) - } - p := &fakeIssuesProvider{problems: probs} - idx := buildIssueIndex(p, nil, "") - tailName := fmtPodName(issues.MaxLimit + 25) - if got := idx.count("", "Pod", "prod", tailName); got != 1 { - t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) - } - if got := idx.count("", "Pod", "prod", fmtPodName(0)); got != 1 { - t.Errorf("head pod count = %d, want 1", got) - } -} - -// TestSummaryContextBuilderFromIndexes_DispatchesByScope pins the -// dual-index dispatch on the MCP path. Search returns mixed-kind hits -// (namespaced Pods + cluster-scoped Nodes); a single namespace-scoped -// index would zero issueCount on the Node hits because their problems -// live at namespace="". Mirror of the REST-side test in -// internal/server. -func TestSummaryContextBuilderFromIndexes_DispatchesByScope(t *testing.T) { - namespacedIdx := issueIndex{} - namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 - - clusterIdx := issueIndex{} - clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 - - build := summaryContextBuilderFromIndexes(nil, namespacedIdx, clusterIdx) - - if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { - t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) - } - if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { - t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) - } - // Cross-bucket name lookups must not leak. 
- if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { - t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) - } - if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { - t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) - } -} - -// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the -// constructor: scanNamespaces non-nil → two distinct indexes, one -// scoped, one cluster-wide. Without this, MCP search responses zero -// out issueCount on Node / PV / cluster-scoped CRD hits. Mirror of the -// REST-side test. -func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { - p := &fakeIssuesProvider{ - problems: []k8s.Problem{ - {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, - {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, - }, - } - - namespacedIdx := buildIssueIndex(p, []string{"prod"}, "") - clusterIdx := buildIssueIndex(p, nil, "") - - if got := namespacedIdx.count("", "Node", "", "worker-1"); got != 0 { - t.Errorf("namespacedIdx Node count = %d, want 0 (sanity)", got) - } - if got := clusterIdx.count("", "Node", "", "worker-1"); got != 1 { - t.Errorf("clusterIdx Node count = %d, want 1", got) - } - if got := namespacedIdx.count("", "Pod", "prod", "api-7"); got != 1 { - t.Errorf("namespacedIdx Pod count = %d, want 1", got) - } - - build := summaryContextBuilderFromIndexes(nil, namespacedIdx, clusterIdx) - if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { - t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) - } - if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { - t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) - } -} - -// TestBuildIssueIndex_ClusterScopedIssueRequiresUnfilteredCompose pins -// the MCP-side 
regression for the cluster-scoped issueCount bug. When -// handleListResources hands a namespace-restricted slice to the issue -// index, cluster-scoped issues (Namespace=="") are dropped by Compose's -// per-namespace problem walk — every Node row gets issueCount=0 even -// when the user has cluster-scoped Node access. The fix routes -// clusterScoped through and forces idxNamespaces=nil before calling -// newSummaryContextBuilder; this test pins the buildIssueIndex behavior -// that backs that path. -func TestBuildIssueIndex_ClusterScopedIssueRequiresUnfilteredCompose(t *testing.T) { - p := &fakeIssuesProvider{ - problems: []k8s.Problem{ - {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, - }, - } - // Cluster-wide compose surfaces the Node issue. - idx := buildIssueIndex(p, nil, "Node") - if got := idx.count("", "Node", "", "worker-1"); got != 1 { - t.Errorf("cluster-wide index: Node issueCount = %d, want 1", got) - } - // Namespace-scoped compose drops the same issue — what the pre-fix - // MCP handler did on every Node list for a namespace-restricted user. 
- scopedIdx := buildIssueIndex(p, []string{"prod", "staging"}, "Node") - if got := scopedIdx.count("", "Node", "", "worker-1"); got != 0 { - t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) - } -} diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index ef491730e..0f967d9df 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -21,6 +21,7 @@ import ( "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" "github.com/skyhook-io/radar/internal/search" + "github.com/skyhook-io/radar/internal/summarycontext" "github.com/skyhook-io/radar/internal/timeline" aicontext "github.com/skyhook-io/radar/pkg/ai/context" topology "github.com/skyhook-io/radar/pkg/topology" @@ -540,7 +541,7 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li // Group is sourced per-object from each typed object's GVK (SetTypeMeta // is called by Minify, so apiVersion is reliable here) — passed through // to the builder so the per-resource issue lookup stays group-aware. -func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder summaryContextBuilder) { +func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder summarycontext.Builder) { if len(results) != len(objs) { return } @@ -601,7 +602,7 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g // Group comes from each unstructured's apiVersion so two CRDs that share // kind+ns+name across API groups (e.g. multiple operators each shipping // a "Cluster" resource) get independent issue counts. 
-func attachSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { +func attachSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { if len(results) != len(items) { return } diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 148bcf4be..362e4236c 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -10,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/internal/summarycontext" aicontext "github.com/skyhook-io/radar/pkg/ai/context" ) @@ -140,7 +141,7 @@ func issueIndexNamespaces(namespaces []string, kind, group string) []string { // Group is sourced per-object from the typed object's GVK (via SetTypeMeta // + ObjectKind), so list paths that mix kinds — they don't today, but the // shape doesn't preclude it — stay correct. -func attachSummaryContextToList(results []any, objs []runtime.Object, builder summaryContextBuilder) { +func attachSummaryContextToList(results []any, objs []runtime.Object, builder summarycontext.Builder) { if len(results) != len(objs) { return } @@ -161,7 +162,7 @@ func attachSummaryContextToList(results []any, objs []runtime.Object, builder su // Group comes from each unstructured's apiVersion — required for issue- // index lookups so two CRDs that share kind+ns+name across groups don't // collide on the per-resource count. 
-func attachSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summaryContextBuilder) { +func attachSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { if len(results) != len(items) { return } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index 6ea894543..2ec5b0bf9 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -1,39 +1,19 @@ // Per-request helpers that compute the compact SummaryContext attached // to /api/ai/resources/{kind} list rows and /api/search hits. // -// The helpers build a single per-namespace issue index and a cached -// topology snapshot up front, then expose a closure callers invoke -// per row. This keeps the per-row cost flat — without the index, -// listing 2000 pods would re-walk the entire issue compose pipeline -// per row. -// -// pkg/resourcecontext intentionally has no dependencies on internal/* -// or pkg/topology; the join happens here. +// The shared core (issue index, kind canonicalization, managedBy +// resolution, per-row scope dispatch) lives in +// internal/summarycontext. This file is the REST-specific wrapper — +// it sources topology from the server-wide broadcaster cache and +// otherwise just plumbs arguments through. package server import ( - "strings" - - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "github.com/skyhook-io/radar/internal/issues" - "github.com/skyhook-io/radar/internal/k8s" - "github.com/skyhook-io/radar/pkg/resourcecontext" - "github.com/skyhook-io/radar/pkg/topology" + "github.com/skyhook-io/radar/internal/summarycontext" ) -// summaryContextBuilder is the per-request closure that produces a -// SummaryContext for a single resource. nil result is fine — the -// SummaryContext field is omitempty on every consumer. 
-// -// group is required so the per-resource issue lookup can distinguish -// CRDs that share kind+namespace+name across API groups (e.g. Knative -// Service vs corev1 Service, or two custom CRDs both named "Cluster" -// from different operators). Pass "" for core-group resources. -type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext - // newSummaryContextBuilder assembles the per-request closure for the // list/search handlers. Returns nil when the cache or topology isn't // available, in which case callers should skip context attachment @@ -47,20 +27,13 @@ type summaryContextBuilder func(obj runtime.Object, u *unstructured.Unstructured // Use newSearchSummaryContextBuilder for search, which routes per-hit // between a namespaced and a cluster-wide index — search returns mixed // kinds in one response, so a single index can't get both right. -func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string) summaryContextBuilder { +func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - // One pass over the issue engine; group by group/kind/ns/name. We - // rely on Filters.IncludeAudit and Filters.IncludeEvents staying - // false-by-default in buildIssueIndex — that's what keeps the - // per-row count to "problem" + "condition" only. Audit + Warning - // events are loud and require explicit opt-in; rolling them into - // the per-row count would distort "this Pod has 1 issue" for the - // common case. 
- idx := buildIssueIndex(provider, namespaces, kindFilter) - return s.summaryContextBuilderFromIndexes(idx, idx) + idx := summarycontext.BuildIssueIndex(provider, namespaces, kindFilter) + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), idx, idx) } // newSearchSummaryContextBuilder is the search-specific variant. Search @@ -83,188 +56,15 @@ func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string // The cluster-wide index is skipped when scanNamespaces is already nil // (cluster-wide user) — both indexes would be identical, so one pass // suffices. -func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summaryContextBuilder { +func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - namespacedIdx := buildIssueIndex(provider, scanNamespaces, "") + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces, "") clusterIdx := namespacedIdx if scanNamespaces != nil { - clusterIdx = buildIssueIndex(provider, nil, "") - } - return s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) -} - -// summaryContextBuilderFromIndexes is the shared closure body for the -// list and search variants. namespacedIdx is used for namespaced hits; -// clusterIdx is used for cluster-scoped hits. The list path passes the -// same index for both (single-kind list, scope already chosen by the -// caller); search passes two distinct indexes. 
-func (s *Server) summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx issueIndex) summaryContextBuilder { - topo := s.broadcaster.GetCachedTopology() - - resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) - dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) - - // One inverted-edges index per request — without it each - // GetRelationships call would re-scan topo.Edges in O(E), turning - // the list/search hot path into O(N × E). See pkg/topology T3. - var relIdx *topology.RelationshipsIndex - if topo != nil { - relIdx = topology.IndexByResource(topo) - } - - return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { - var managedBy *resourcecontext.ManagedByRef - if topo != nil { - // Pass the fetched object when available so synthesis is - // group-aware (avoids kind/plural collisions like Knative - // Service vs corev1 Service). Falls back to (kind, ns, name) - // lookup when neither obj nor u is set. - var rawObj any - switch { - case obj != nil: - rawObj = obj - case u != nil: - rawObj = u - } - rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) - managedBy = managedByFromRelationships(rel) - } - var source runtime.Object = obj - if source == nil && u != nil { - source = u - } - // Dispatch by scope: cluster-scoped hits read from clusterIdx - // (composed at namespace=nil so namespace="" issues are present), - // namespaced hits read from namespacedIdx (which honors the - // user's namespace filter so the per-row count doesn't pull in - // noise from namespaces the user can't see). 
- idx := namespacedIdx - if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { - idx = clusterIdx - } - return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ - ManagedBy: managedBy, - IssueCount: idx.count(group, kind, namespace, name), - }) - } -} - -// issueIndex keys per-resource issue counts as "group|kind|namespace|name". -// Group goes FIRST so two CRDs sharing kind+namespace+name across API -// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, -// or two operators each shipping a "Cluster" CRD) get independent counts -// instead of inheriting each other's. Kind is canonicalized via -// strings.ToLower because issue sources emit the kind as-typed -// (Deployment) while callers may pass the URL plural (deployments) — -// lowercase normalizes both. "|" can't appear in a Kubernetes API group -// (groups follow DNS subdomain rules: lowercase alphanumerics, "-", -// and "."), so it's a safe delimiter. -type issueIndex map[string]int - -func (i issueIndex) count(group, kind, namespace, name string) int { - return i[issueIndexKey(group, kind, namespace, name)] -} - -func issueIndexKey(group, kind, namespace, name string) string { - return group + "|" + strings.ToLower(canonicalSingular(kind)) + "|" + namespace + "|" + name -} - -// canonicalSingular collapses common plural forms back to the singular -// kind the issue engine emits. Cheap surface — only the kinds we -// actually scan in list_resources / search. 
-func canonicalSingular(kind string) string { - k := strings.ToLower(kind) - switch k { - case "pods": - return "pod" - case "services": - return "service" - case "deployments": - return "deployment" - case "daemonsets": - return "daemonset" - case "statefulsets": - return "statefulset" - case "replicasets": - return "replicaset" - case "jobs": - return "job" - case "cronjobs": - return "cronjob" - case "ingresses": - return "ingress" - case "configmaps": - return "configmap" - case "secrets": - return "secret" - case "persistentvolumeclaims": - return "persistentvolumeclaim" - case "persistentvolumes": - return "persistentvolume" - case "storageclasses": - return "storageclass" - case "horizontalpodautoscalers", "hpas", "hpa": - return "horizontalpodautoscaler" - case "poddisruptionbudgets": - return "poddisruptionbudget" - case "nodes": - return "node" - case "namespaces": - return "namespace" - case "events": - return "event" - } - return k -} - -func buildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) issueIndex { - // NoLimit (not MaxLimit) is required here: a 5000-issue cluster would - // otherwise truncate after the first 1000 sorted rows, silently - // zeroing issueCount for resources whose issues fall in the tail. - // We're bucketing for a per-resource lookup, not paginating — the - // caller of summaryContext never sees the issue list itself. - filters := issues.Filters{ - Namespaces: namespaces, - Limit: issues.NoLimit, - } - if kindFilter != "" { - // Compose's Kinds filter expects the singular kind ("Pod"). The - // caller may pass either the URL plural ("pods") or the singular — - // canonicalSingular normalizes both before issuing the filter. 
- filters.Kinds = []string{canonicalSingular(kindFilter)} - } - composed := issues.Compose(p, filters) - idx := make(issueIndex, len(composed)) - for _, iss := range composed { - idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ - } - return idx -} - -// managedByFromRelationships extracts a compact ManagedByRef from -// computed topology relationships. Preference order: -// 1. Relationships.ManagedBy[0] — the server-synthesized topmost -// manager (ArgoCD Application > Flux Kustomization/HelmRelease > -// Helm release > topmost K8s owner). Walks the owner chain past -// ReplicaSets to the controlling Deployment in one shot. -// 2. Direct Owner — fallback for shapes ManagedBy synthesis declines -// (e.g. cluster-scoped roots where the topmost manager is the -// resource itself). -// -// Returns nil when topology has no relationship for the resource. -func managedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { - if rel == nil { - return nil - } - if len(rel.ManagedBy) > 0 { - ref := rel.ManagedBy[0] - return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) - } - if rel.Owner != nil { - return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) + clusterIdx = summarycontext.BuildIssueIndex(provider, nil, "") } - return nil + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), namespacedIdx, clusterIdx) } diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 9e4e18f89..848744c00 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -1,90 +1,31 @@ +// Wiring tests for the REST-side SummaryContext builders. 
The pure- +// function tests (issueIndex key arithmetic, BuildIssueIndex over a +// fake provider, CanonicalSingular, ManagedByFromRelationships) live in +// internal/summarycontext alongside the shared core they exercise. + package server import ( "encoding/json" - "fmt" "testing" - "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "github.com/skyhook-io/radar/internal/issues" - "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/internal/summarycontext" aicontext "github.com/skyhook-io/radar/pkg/ai/context" - bp "github.com/skyhook-io/radar/pkg/audit" "github.com/skyhook-io/radar/pkg/resourcecontext" - "github.com/skyhook-io/radar/pkg/topology" ) -// k8sProblem is the test-side alias kept short so generated rows -// don't have to repeat the package qualifier. -type k8sProblem = k8s.Problem - -// issuesMaxLimit mirrors internal/issues.MaxLimit at test scope so the -// MaxLimit-overflow assertion doesn't depend on test order against the -// importing package's constant. -var issuesMaxLimit = issues.MaxLimit - -// fakeIssuesProvider is a minimal issues.Provider for the buildIssueIndex -// tests. Only the fields the index path touches are wired; the CRD- -// condition fallback path is exercised by issues' own tests. -// -// DetectProblems mirrors CacheProvider.DetectProblems' shape: -// namespaces=nil returns the full set (including cluster-scoped rows at -// namespace=""); a non-empty namespaces slice drops cluster-scoped rows -// (matching flattenNamespacedProblems) so per-row tests can pin the -// "cluster-scoped issue silently filtered" behavior the production code -// exhibits. 
-type fakeIssuesProvider struct { - problems []k8s.Problem -} - -func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { - if len(namespaces) == 0 { - return f.problems - } - allowed := map[string]bool{} - for _, ns := range namespaces { - allowed[ns] = true - } - out := make([]k8s.Problem, 0, len(f.problems)) - for _, p := range f.problems { - // Cluster-scoped problems (Namespace=="") are dropped under a - // namespace filter — matches flattenNamespacedProblems in the - // production CacheProvider. - if p.Namespace == "" { - continue - } - if allowed[p.Namespace] { - out = append(out, p) - } - } - return out -} -func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } -func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } -func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { - return nil -} -func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } -func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { - return nil, nil -} -func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } - -func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } - // stubBuilder records calls and returns a deterministic SummaryContext // keyed by the resource identity. Avoids standing up a topology cache or // issue provider — those are exercised by the per-layer unit tests. // // Key shape mirrors the production issueIndexKey (group|kind|ns|name) // so test fixtures pin the group-aware lookup. 
-func stubBuilder(t *testing.T, want map[string]*resourcecontext.SummaryContext) summaryContextBuilder { +func stubBuilder(t *testing.T, want map[string]*resourcecontext.SummaryContext) summarycontext.Builder { t.Helper() return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { key := group + "|" + kind + "|" + namespace + "|" + name @@ -215,163 +156,6 @@ func TestAttachSummaryContextToUnstructuredList(t *testing.T) { } } -// TestManagedByFromRelationships_PrefersManagedBy pins the topmost-manager -// shortcut: when topology has synthesized a ManagedBy chain (Pod → -// ReplicaSet → Deployment), the helper surfaces the Deployment, not the -// noisy hash-suffixed ReplicaSet that sits in Owner. -func TestManagedByFromRelationships_PrefersManagedBy(t *testing.T) { - rel := &topology.Relationships{ - Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, - ManagedBy: []topology.ResourceRef{ - {Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, - }, - } - got := managedByFromRelationships(rel) - want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} - if got == nil || got.Kind != want.Kind || got.Name != want.Name || got.Namespace != want.Namespace || got.Source != want.Source { - t.Errorf("got %#v, want %#v", got, want) - } -} - -// TestManagedByFromRelationships_FallsBackToOwner covers the case where -// topology synthesis declined ManagedBy (e.g. cluster-scoped roots) — -// we still surface the direct Owner so the row isn't context-less. 
-func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { - rel := &topology.Relationships{ - Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, - } - got := managedByFromRelationships(rel) - if got == nil { - t.Fatalf("got nil, want Application ref") - } - if got.Source != "argocd" { - t.Errorf("Source = %q, want argocd", got.Source) - } -} - -// TestManagedByFromRelationships_ManagedByWinsOverOwner pins that when -// both ManagedBy and Owner are set, ManagedBy[0] takes precedence — the -// server-synthesized topmost-manager walk should never be shadowed by -// the direct owner ref left over for back-compat. -func TestManagedByFromRelationships_ManagedByWinsOverOwner(t *testing.T) { - rel := &topology.Relationships{ - Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, - ManagedBy: []topology.ResourceRef{ - {Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, - }, - } - got := managedByFromRelationships(rel) - if got == nil || got.Kind != "Application" || got.Source != "argocd" { - t.Errorf("got %#v, want Application/argocd", got) - } -} - -func TestManagedByFromRelationships_NilSafe(t *testing.T) { - if got := managedByFromRelationships(nil); got != nil { - t.Errorf("nil rel: got %#v, want nil", got) - } - if got := managedByFromRelationships(&topology.Relationships{}); got != nil { - t.Errorf("empty rel: got %#v, want nil", got) - } -} - -// TestIssueIndexKey_GroupAware pins that two resources sharing -// kind+namespace+name but in different API groups get independent -// counts. Without group in the key, e.g. Knative serving.knative.dev/ -// Service vs corev1 ""/Service collapse onto one bucket — and either -// the CRD inherits the core Service's count or vice versa. This breaks -// the moment a user has two operators each shipping a kind named -// "Cluster" in the same namespace. 
-func TestIssueIndexKey_GroupAware(t *testing.T) { - idx := issueIndex{} - // Same kind+ns+name, different groups — must be independent buckets. - idx[issueIndexKey("", "Service", "prod", "api")] = 2 - idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 - - if got := idx.count("", "Service", "prod", "api"); got != 2 { - t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) - } - if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 5 { - t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) - } - // Wrong group lookup is a miss, not a fallback. - if got := idx.count("example.io", "Service", "prod", "api"); got != 0 { - t.Errorf("unknown-group lookup = %d, want 0 (key should not coalesce across groups)", got) - } -} - -// TestBuildIssueIndex_GroupAware exercises the full buildIssueIndex -// path with two CRDs that share kind+namespace+name but live in -// different API groups. Pre-fix, both rows landed under the same -// "service|prod|api" key and one inherited the other's count. -func TestBuildIssueIndex_GroupAware(t *testing.T) { - // Inject via a fake issues.Provider rather than the cache plumbing — - // keeps the test focused on the index-key arithmetic. 
- p := &fakeIssuesProvider{ - problems: []k8sProblem{ - {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, - {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, - {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, - }, - } - idx := buildIssueIndex(p, nil, "") - if got := idx.count("", "Service", "prod", "api"); got != 1 { - t.Errorf("core Service count = %d, want 1", got) - } - if got := idx.count("serving.knative.dev", "Service", "prod", "api"); got != 2 { - t.Errorf("Knative Service count = %d, want 2", got) - } -} - -// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues -// would fall in the tail beyond MaxLimit still get correct issueCounts. -// Pre-fix, buildIssueIndex passed Limit:MaxLimit (1000) to Compose; on -// a cluster with >1000 issues the post-sort truncation silently zeroed -// out counts for tail resources. The fix is Limit:NoLimit — the index -// is a bucketed count, not a paginated list. -func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { - // Generate MaxLimit+50 problem rows across distinct resources so - // every bucket has exactly one issue. Without the NoLimit fix, the - // last 50 resources' counts collapse to 0. - probs := make([]k8sProblem, 0, issuesMaxLimit+50) - for i := 0; i < issuesMaxLimit+50; i++ { - probs = append(probs, k8sProblem{ - Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", - }) - } - p := &fakeIssuesProvider{problems: probs} - idx := buildIssueIndex(p, nil, "") - // Spot-check a tail resource — anything beyond MaxLimit must still - // resolve to count=1, not 0. 
- tailName := fmtPodName(issuesMaxLimit + 25) - if got := idx.count("", "Pod", "prod", tailName); got != 1 { - t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) - } - // And the first resource sees its count too — sanity that the - // truncation didn't shift in the other direction. - if got := idx.count("", "Pod", "prod", fmtPodName(0)); got != 1 { - t.Errorf("head pod count = %d, want 1", got) - } -} - -// TestCanonicalSingular pins the kind normalization used to align URL -// plurals with the singular form the issue engine emits. -func TestCanonicalSingular(t *testing.T) { - cases := map[string]string{ - "pods": "pod", - "Pods": "pod", - "Deployment": "deployment", - "deployments": "deployment", - "hpa": "horizontalpodautoscaler", - "unknownkind": "unknownkind", - } - for in, want := range cases { - if got := canonicalSingular(in); got != want { - t.Errorf("canonicalSingular(%q) = %q, want %q", in, got, want) - } - } -} - // contains is a tiny strings.Contains alias kept local so the test file // doesn't need a strings import alongside the existing imports. func contains(s, sub string) bool { @@ -445,140 +229,3 @@ func TestIssueIndexNamespaces_ClusterScopedDropsFilter(t *testing.T) { t.Errorf("issueIndexNamespaces(nil, Pod) = %v, want nil", got) } } - -// TestSummaryContextBuilderFromIndexes_DispatchesByScope pins the -// dual-index dispatch: cluster-scoped hits (Node, PV, …) read the -// cluster-wide index (where namespace="" issues live), namespaced hits -// (Pod, Deployment, …) read the namespace-scoped index. Without this -// dispatch, a search response that mixes Pods and Nodes silently zeros -// issueCount on the Node hits — the namespace-scoped index drops every -// namespace="" issue. -// -// This pins what the search-handler-level fix relies on: the two -// indexes must be wired to the two scopes via the closure, not the -// other way around. 
A wiring inversion (cluster-scoped → namespaced -// index) would re-introduce the bug. -func TestSummaryContextBuilderFromIndexes_DispatchesByScope(t *testing.T) { - // Build two distinct indexes so we can tell which one was consulted. - // The cluster index sees a Node issue at namespace=""; the - // namespaced index has a Pod issue in "prod". An index leak would - // surface either the Node count under Pod or vice versa. - namespacedIdx := issueIndex{} - namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 - - clusterIdx := issueIndex{} - clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 - - // Server with a no-op broadcaster — the builder reads - // GetCachedTopology() which returns nil when no topology has been - // built; managedBy will be nil but issueCount dispatch is what we're - // pinning here. - s := &Server{broadcaster: NewSSEBroadcaster()} - build := s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) - - // Cluster-scoped Node hit — must read clusterIdx. - if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { - t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) - } - // Namespaced Pod hit — must read namespacedIdx. - if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { - t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) - } - // A cluster-scoped hit whose name only lives in the namespaced - // index must return 0 (no cross-bucket leak). - if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { - t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) - } - // And a namespaced hit whose name only lives in the cluster index - // likewise returns 0. 
- if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { - t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) - } -} - -// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end -// shape used by /api/search and MCP search: scanNamespaces is non-nil -// (a namespace-restricted user, or a user with a `ns:` query modifier), -// so the constructor must compose TWO issue indexes — one scoped to -// those namespaces, one cluster-wide for cluster-scoped hits. Without -// the second index, the Node hit's summaryContext.issueCount returns -// 0 because every Node issue lives at namespace="" and the namespace -// filter drops them. -// -// Exercise via the issues.Provider seam: a fakeIssuesProvider that -// emits one Node problem at namespace="" and one Pod problem in -// "prod". With scanNamespaces=["prod"], the Node count must still -// surface (proving the cluster-wide index was built and routed to). -func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { - p := &fakeIssuesProvider{ - problems: []k8sProblem{ - {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, - {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, - }, - } - - // Build the two indexes the constructor would build. - namespacedIdx := buildIssueIndex(p, []string{"prod"}, "") - clusterIdx := buildIssueIndex(p, nil, "") - - // Sanity: pre-fix, the search handler passed namespacedIdx for - // both; Node issueCount silently zeroed. 
- if got := namespacedIdx.count("", "Node", "", "worker-1"); got != 0 { - t.Errorf("namespacedIdx Node count = %d, want 0 (sanity — namespace filter drops cluster-scoped issues)", got) - } - if got := clusterIdx.count("", "Node", "", "worker-1"); got != 1 { - t.Errorf("clusterIdx Node count = %d, want 1 (cluster-wide compose surfaces namespace=\"\" issues)", got) - } - - // And the namespaced Pod issue must surface in the namespaced - // index — search RBAC has already gated namespace visibility, so - // the per-row count should respect the scan boundary instead of - // composing cluster-wide and pulling in noise from other - // namespaces. - if got := namespacedIdx.count("", "Pod", "prod", "api-7"); got != 1 { - t.Errorf("namespacedIdx Pod count = %d, want 1", got) - } - - // With both indexes built, the closure dispatches per-hit by - // scope. Replay the dispatch via the shared helper to pin the - // end-to-end shape. - s := &Server{broadcaster: NewSSEBroadcaster()} - build := s.summaryContextBuilderFromIndexes(namespacedIdx, clusterIdx) - if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { - t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) - } - if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { - t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) - } -} - -// TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the -// end-to-end behavior the issueIndexNamespaces helper enables: when the -// builder passes nil for the namespace filter (cluster-scoped kind), -// node-level issues at namespace="" surface in the index and the -// per-resource lookup returns the correct count. With a namespace -// filter populated, those same issues are dropped because Compose's -// per-namespace problem walk never sees them. 
-func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) { - p := &fakeIssuesProvider{ - problems: []k8sProblem{ - // Cluster-scoped Node issue: namespace="" — the actual shape - // k8s.DetectProblems emits for NodeNotReady / DiskPressure etc. - {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, - }, - } - - // Cluster-wide compose (nil namespaces) — issue surfaces. - idx := buildIssueIndex(p, nil, "Node") - if got := idx.count("", "Node", "", "worker-1"); got != 1 { - t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) - } - - // Namespace-scoped compose — same issue, but ns filter to ["prod","staging"] - // drops it because the user-namespaced perm slice never matches "". - // This is what the pre-fix handler did for Node lists. - scopedIdx := buildIssueIndex(p, []string{"prod", "staging"}, "Node") - if got := scopedIdx.count("", "Node", "", "worker-1"); got != 0 { - t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) - } -} diff --git a/internal/summarycontext/summarycontext.go b/internal/summarycontext/summarycontext.go new file mode 100644 index 000000000..0123a858c --- /dev/null +++ b/internal/summarycontext/summarycontext.go @@ -0,0 +1,221 @@ +// Package summarycontext is the shared core that powers the compact +// SummaryContext attached to /api/ai/resources/{kind} list rows, /api/search +// hits, and the MCP list_resources / search variants. +// +// The REST and MCP wrappers (internal/server, internal/mcp) differ only +// in their topology source — REST reads from a server-wide broadcaster +// cache; MCP memoizes per-process builds. Everything else (issue index, +// kind canonicalization, managedBy resolution, per-row dispatch by +// scope) is identical, so it lives here. 
+// +// pkg/resourcecontext intentionally has no dependencies on internal/* +// or pkg/topology; the join happens here. +package summarycontext + +import ( + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// Builder is the per-request closure that produces a SummaryContext for +// a single resource. nil result is fine — the SummaryContext field is +// omitempty on every consumer. +// +// group is required so the per-resource issue lookup can distinguish +// CRDs that share kind+namespace+name across API groups (e.g. Knative +// Service vs corev1 Service, or two custom CRDs both named "Cluster" +// from different operators). Pass "" for core-group resources. +type Builder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext + +// BuilderFromIndexes assembles the per-request closure. The list path +// passes the same index for both namespacedIdx and clusterIdx (single- +// kind list, scope already chosen by the caller); search passes two +// distinct indexes — namespacedIdx scoped to user namespaces, clusterIdx +// composed cluster-wide. The closure dispatches per-hit by scope so +// cluster-scoped hits read the cluster-wide index and surface +// namespace="" issues that the namespaced filter would otherwise drop. +// +// topo is the topology snapshot the caller has already obtained from +// its preferred source (REST: broadcaster cache; MCP: short-TTL +// memoizer). nil topo is fine — managedBy is omitted but issueCount +// still resolves. 
+func BuilderFromIndexes(topo *topology.Topology, namespacedIdx, clusterIdx IssueIndex) Builder { + resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) + dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + // One inverted-edges index per request — without it each + // GetRelationships call would re-scan topo.Edges in O(E), turning + // the list/search hot path into O(N × E). See pkg/topology T3. + var relIdx *topology.RelationshipsIndex + if topo != nil { + relIdx = topology.IndexByResource(topo) + } + + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { + var managedBy *resourcecontext.ManagedByRef + if topo != nil { + // Pass the fetched object when available so synthesis is + // group-aware (avoids kind/plural collisions like Knative + // Service vs corev1 Service). Falls back to (kind, ns, name) + // lookup when neither obj nor u is set. + var rawObj any + switch { + case obj != nil: + rawObj = obj + case u != nil: + rawObj = u + } + rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) + managedBy = ManagedByFromRelationships(rel) + } + var source runtime.Object = obj + if source == nil && u != nil { + source = u + } + // Dispatch by scope: cluster-scoped hits read clusterIdx (composed + // at namespace=nil so namespace="" issues are present), namespaced + // hits read namespacedIdx (which honors the user's namespace + // filter so the per-row count doesn't pull in noise from + // namespaces the user can't see). 
+ idx := namespacedIdx + if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { + idx = clusterIdx + } + return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ + ManagedBy: managedBy, + IssueCount: idx.Count(group, kind, namespace, name), + }) + } +} + +// IssueIndex keys per-resource issue counts as "group|kind|namespace|name". +// Group goes FIRST so two CRDs sharing kind+namespace+name across API +// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, +// or two operators each shipping a "Cluster" CRD) get independent counts +// instead of inheriting each other's. Kind is canonicalized via +// CanonicalSingular because issue sources emit the kind as-typed +// (Deployment) while callers may pass the URL plural (deployments) — +// canonicalization normalizes both. "|" can't appear in a Kubernetes API +// group (groups follow DNS subdomain rules), so it's a safe delimiter. +type IssueIndex map[string]int + +// Count returns the per-resource issue count, keyed by the group-aware +// composite key. Zero on miss. +func (i IssueIndex) Count(group, kind, namespace, name string) int { + return i[issueIndexKey(group, kind, namespace, name)] +} + +func issueIndexKey(group, kind, namespace, name string) string { + return group + "|" + strings.ToLower(CanonicalSingular(kind)) + "|" + namespace + "|" + name +} + +// CanonicalSingular collapses common plural forms back to the singular +// kind the issue engine emits. Cheap surface — only the kinds we +// actually scan in list_resources / search. 
+func CanonicalSingular(kind string) string { + k := strings.ToLower(kind) + switch k { + case "pods": + return "pod" + case "services": + return "service" + case "deployments": + return "deployment" + case "daemonsets": + return "daemonset" + case "statefulsets": + return "statefulset" + case "replicasets": + return "replicaset" + case "jobs": + return "job" + case "cronjobs": + return "cronjob" + case "ingresses": + return "ingress" + case "configmaps": + return "configmap" + case "secrets": + return "secret" + case "persistentvolumeclaims": + return "persistentvolumeclaim" + case "persistentvolumes": + return "persistentvolume" + case "storageclasses": + return "storageclass" + case "horizontalpodautoscalers", "hpas", "hpa": + return "horizontalpodautoscaler" + case "poddisruptionbudgets": + return "poddisruptionbudget" + case "nodes": + return "node" + case "namespaces": + return "namespace" + case "events": + return "event" + } + return k +} + +// BuildIssueIndex composes the per-request issue index. NoLimit (not +// MaxLimit) is required here: a 5000-issue cluster would otherwise +// truncate after the first 1000 sorted rows, silently zeroing +// issueCount for resources whose issues fall in the tail. We're +// bucketing for a per-resource lookup, not paginating — the caller of +// the builder never sees the issue list itself. +// +// We rely on Filters.IncludeAudit and Filters.IncludeEvents staying +// false-by-default — that's what keeps the per-row count to "problem" +// + "condition" only. Audit + Warning events are loud and require +// explicit opt-in; rolling them into the per-row count would distort +// "this Pod has 1 issue" for the common case. +func BuildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) IssueIndex { + filters := issues.Filters{ + Namespaces: namespaces, + Limit: issues.NoLimit, + } + if kindFilter != "" { + // Compose's Kinds filter expects the singular kind ("Pod"). 
The + // caller may pass either the URL plural ("pods") or the singular — + // CanonicalSingular normalizes both before issuing the filter. + filters.Kinds = []string{CanonicalSingular(kindFilter)} + } + composed := issues.Compose(p, filters) + idx := make(IssueIndex, len(composed)) + for _, iss := range composed { + idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ + } + return idx +} + +// ManagedByFromRelationships extracts a compact ManagedByRef from +// computed topology relationships. Preference order: +// 1. Relationships.ManagedBy[0] — the server-synthesized topmost +// manager (ArgoCD Application > Flux Kustomization/HelmRelease > +// Helm release > topmost K8s owner). Walks the owner chain past +// ReplicaSets to the controlling Deployment in one shot. +// 2. Direct Owner — fallback for shapes ManagedBy synthesis declines +// (e.g. cluster-scoped roots where the topmost manager is the +// resource itself). +// +// Returns nil when topology has no relationship for the resource. +func ManagedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { + if rel == nil { + return nil + } + if len(rel.ManagedBy) > 0 { + ref := rel.ManagedBy[0] + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + } + if rel.Owner != nil { + return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) + } + return nil +} diff --git a/internal/summarycontext/summarycontext_test.go b/internal/summarycontext/summarycontext_test.go new file mode 100644 index 000000000..b91c0a7e5 --- /dev/null +++ b/internal/summarycontext/summarycontext_test.go @@ -0,0 +1,334 @@ +// Pure-function tests for the shared summarycontext core. The +// REST/MCP-specific wiring tests (attachSummaryContextToList, +// dispatch-on-CanReadClusterScoped, the ai-handler issueIndexNamespaces +// helper) stay at their respective handler sites in internal/server +// and internal/mcp. 
+ +package summarycontext + +import ( + "fmt" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + bp "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// fakeIssuesProvider is a minimal issues.Provider for the BuildIssueIndex +// tests. Only the fields the index path touches are wired. +// +// DetectProblems mirrors CacheProvider.DetectProblems: empty namespaces +// returns the full set; a non-empty slice drops cluster-scoped rows +// (Namespace=="") to match the production flattenNamespacedProblems +// behavior — needed so the cluster-scoped-filter regression test can +// pin the actual bug. +type fakeIssuesProvider struct { + problems []k8s.Problem +} + +func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { + if len(namespaces) == 0 { + return f.problems + } + allowed := map[string]bool{} + for _, ns := range namespaces { + allowed[ns] = true + } + out := make([]k8s.Problem, 0, len(f.problems)) + for _, p := range f.problems { + if p.Namespace == "" { + continue + } + if allowed[p.Namespace] { + out = append(out, p) + } + } + return out +} +func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } +func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } +func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { + return nil +} +func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } +func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { + return nil, nil +} +func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } + +func 
fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } + +// TestIssueIndexKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups get independent +// counts. Without group in the key, e.g. Knative serving.knative.dev/ +// Service vs corev1 ""/Service collapse onto one bucket — and either +// the CRD inherits the core Service's count or vice versa. This breaks +// the moment a user has two operators each shipping a kind named +// "Cluster" in the same namespace. +func TestIssueIndexKey_GroupAware(t *testing.T) { + idx := IssueIndex{} + // Same kind+ns+name, different groups — must be independent buckets. + idx[issueIndexKey("", "Service", "prod", "api")] = 2 + idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 + + if got := idx.Count("", "Service", "prod", "api"); got != 2 { + t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 5 { + t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) + } + // Wrong group lookup is a miss, not a fallback. + if got := idx.Count("example.io", "Service", "prod", "api"); got != 0 { + t.Errorf("unknown-group lookup = %d, want 0 (key should not coalesce across groups)", got) + } +} + +// TestBuildIssueIndex_GroupAware exercises the full BuildIssueIndex +// path with two CRDs that share kind+namespace+name but live in +// different API groups. Pre-fix, both rows landed under the same +// "service|prod|api" key and one inherited the other's count. +func TestBuildIssueIndex_GroupAware(t *testing.T) { + // Inject via a fake issues.Provider rather than the cache plumbing — + // keeps the test focused on the index-key arithmetic. 
+ p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, + }, + } + idx := BuildIssueIndex(p, nil, "") + if got := idx.Count("", "Service", "prod", "api"); got != 1 { + t.Errorf("core Service count = %d, want 1", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 2 { + t.Errorf("Knative Service count = %d, want 2", got) + } +} + +// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues +// would fall in the tail beyond MaxLimit still get correct issueCounts. +// Pre-fix, BuildIssueIndex passed Limit:MaxLimit (1000) to Compose; on +// a cluster with >1000 issues the post-sort truncation silently zeroed +// out counts for tail resources. The fix is Limit:NoLimit — the index +// is a bucketed count, not a paginated list. +func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { + probs := make([]k8s.Problem, 0, issues.MaxLimit+50) + for i := 0; i < issues.MaxLimit+50; i++ { + probs = append(probs, k8s.Problem{ + Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", + }) + } + p := &fakeIssuesProvider{problems: probs} + idx := BuildIssueIndex(p, nil, "") + tailName := fmtPodName(issues.MaxLimit + 25) + if got := idx.Count("", "Pod", "prod", tailName); got != 1 { + t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) + } + if got := idx.Count("", "Pod", "prod", fmtPodName(0)); got != 1 { + t.Errorf("head pod count = %d, want 1", got) + } +} + +// TestCanonicalSingular pins the kind normalization used to align URL +// plurals with the singular form the issue engine emits. 
+func TestCanonicalSingular(t *testing.T) { + cases := map[string]string{ + "pods": "pod", + "Pods": "pod", + "Deployment": "deployment", + "deployments": "deployment", + "hpa": "horizontalpodautoscaler", + "unknownkind": "unknownkind", + } + for in, want := range cases { + if got := CanonicalSingular(in); got != want { + t.Errorf("CanonicalSingular(%q) = %q, want %q", in, got, want) + } + } +} + +// TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the +// end-to-end behavior: when the builder passes nil for the namespace +// filter (cluster-scoped kind), node-level issues at namespace="" +// surface in the index and the per-resource lookup returns the correct +// count. With a namespace filter populated, those same issues are +// dropped because Compose's per-namespace problem walk never sees them. +func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + // Cluster-scoped Node issue: namespace="" — the actual shape + // k8s.DetectProblems emits for NodeNotReady / DiskPressure etc. + {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + }, + } + + // Cluster-wide compose (nil namespaces) — issue surfaces. + idx := BuildIssueIndex(p, nil, "Node") + if got := idx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) + } + + // Namespace-scoped compose — same issue, but ns filter to + // ["prod","staging"] drops it because the user-namespaced perm + // slice never matches "". This is what the pre-fix handler did for + // Node lists. 
+ scopedIdx := BuildIssueIndex(p, []string{"prod", "staging"}, "Node") + if got := scopedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) + } +} + +// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end +// shape used by /api/search and MCP search: scanNamespaces is non-nil +// (a namespace-restricted user, or a user with a `ns:` query modifier), +// so the constructor must compose TWO issue indexes — one scoped to +// those namespaces, one cluster-wide for cluster-scoped hits. Without +// the second index, the Node hit's summaryContext.issueCount returns +// 0 because every Node issue lives at namespace="" and the namespace +// filter drops them. +func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, + }, + } + + // Build the two indexes the search constructor would build. + namespacedIdx := BuildIssueIndex(p, []string{"prod"}, "") + clusterIdx := BuildIssueIndex(p, nil, "") + + // Sanity: pre-fix, the search handler passed namespacedIdx for + // both; Node issueCount silently zeroed. 
+ if got := namespacedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespacedIdx Node count = %d, want 0 (sanity — namespace filter drops cluster-scoped issues)", got) + } + if got := clusterIdx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("clusterIdx Node count = %d, want 1 (cluster-wide compose surfaces namespace=\"\" issues)", got) + } + if got := namespacedIdx.Count("", "Pod", "prod", "api-7"); got != 1 { + t.Errorf("namespacedIdx Pod count = %d, want 1", got) + } + + // With both indexes built, the closure dispatches per-hit by + // scope. Replay the dispatch via the shared helper to pin the + // end-to-end shape. Topology is nil; managedBy is nil but + // issueCount dispatch is what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) + } +} + +// TestBuilderFromIndexes_DispatchesByScope pins the dual-index dispatch: +// cluster-scoped hits (Node, PV, …) read the cluster-wide index (where +// namespace="" issues live), namespaced hits (Pod, Deployment, …) read +// the namespace-scoped index. Without this dispatch, a search response +// that mixes Pods and Nodes silently zeros issueCount on the Node hits +// — the namespace-scoped index drops every namespace="" issue. +// +// A wiring inversion (cluster-scoped → namespaced index) would +// re-introduce the bug, so we additionally assert no cross-bucket leak. +func TestBuilderFromIndexes_DispatchesByScope(t *testing.T) { + // Build two distinct indexes so we can tell which one was consulted. 
+ namespacedIdx := IssueIndex{} + namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 + + clusterIdx := IssueIndex{} + clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 + + // Topology is nil — managedBy is nil but issueCount dispatch is + // what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + + // Cluster-scoped Node hit — must read clusterIdx. + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { + t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) + } + // Namespaced Pod hit — must read namespacedIdx. + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { + t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) + } + // A cluster-scoped hit whose name only lives in the namespaced + // index must return 0 (no cross-bucket leak). + if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) + } + // And a namespaced hit whose name only lives in the cluster index + // likewise returns 0. + if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) + } +} + +// TestManagedByFromRelationships_PrefersManagedBy pins the topmost-manager +// shortcut: when topology has synthesized a ManagedBy chain (Pod → +// ReplicaSet → Deployment), the helper surfaces the Deployment, not the +// noisy hash-suffixed ReplicaSet that sits in Owner. 
+func TestManagedByFromRelationships_PrefersManagedBy(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + }, + } + got := ManagedByFromRelationships(rel) + want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} + if got == nil || got.Kind != want.Kind || got.Name != want.Name || got.Namespace != want.Namespace || got.Source != want.Source { + t.Errorf("got %#v, want %#v", got, want) + } +} + +// TestManagedByFromRelationships_FallsBackToOwner covers the case where +// topology synthesis declined ManagedBy (e.g. cluster-scoped roots) — +// we still surface the direct Owner so the row isn't context-less. +func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + } + got := ManagedByFromRelationships(rel) + if got == nil { + t.Fatalf("got nil, want Application ref") + } + if got.Source != "argocd" { + t.Errorf("Source = %q, want argocd", got.Source) + } +} + +// TestManagedByFromRelationships_ManagedByWinsOverOwner pins that when +// both ManagedBy and Owner are set, ManagedBy[0] takes precedence — the +// server-synthesized topmost-manager walk should never be shadowed by +// the direct owner ref left over for back-compat. 
+func TestManagedByFromRelationships_ManagedByWinsOverOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + }, + } + got := ManagedByFromRelationships(rel) + if got == nil || got.Kind != "Application" || got.Source != "argocd" { + t.Errorf("got %#v, want Application/argocd", got) + } +} + +func TestManagedByFromRelationships_NilSafe(t *testing.T) { + if got := ManagedByFromRelationships(nil); got != nil { + t.Errorf("nil rel: got %#v, want nil", got) + } + if got := ManagedByFromRelationships(&topology.Relationships{}); got != nil { + t.Errorf("empty rel: got %#v, want nil", got) + } +} From c51e0b1e265479727547953d9c465b0b02906ad5 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 16:42:26 +0300 Subject: [PATCH 19/33] =?UTF-8?q?refactor(resourcecontext):=20rename=20Sum?= =?UTF-8?q?maryContext=20=E2=86=92=20ResourceSummaryContext?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telegraph the relationship to ResourceContext: this is the row-tier companion of the detail-tier ResourceContext. The previous "SummaryContext" name implied a different concept; the new name makes the role explicit (row-tier enrichment on lists + search hits). ManagedByRef stays flat — only the type rename plus the field reference chase across server / mcp / search / summarycontext / ai-context callers. Wire JSON tag is unchanged ("summaryContext") so external consumers see no shape difference. 
--- internal/mcp/summary_context.go | 8 ++-- internal/mcp/tools.go | 16 +++---- internal/search/search.go | 2 +- internal/search/summary_context_test.go | 4 +- internal/search/types.go | 3 +- internal/server/ai_handlers.go | 18 ++++---- internal/server/summary_context.go | 6 +-- internal/server/summary_context_test.go | 38 +++++++-------- internal/summarycontext/summarycontext.go | 10 ++-- pkg/ai/context/summary.go | 56 ++++++++++++----------- pkg/resourcecontext/summary.go | 9 ++-- pkg/resourcecontext/summary_test.go | 2 +- pkg/resourcecontext/types.go | 15 +++--- pkg/resourcecontext/types_test.go | 16 +++---- 14 files changed, 104 insertions(+), 99 deletions(-) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index f5af123df..85ff7b122 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -1,4 +1,4 @@ -// Per-request helpers that compute the compact SummaryContext attached +// Per-request helpers that compute the compact ResourceSummaryContext attached // to list_resources rows and search hits served via MCP. // // The shared core (issue index, kind canonicalization, managedBy @@ -18,7 +18,7 @@ import ( "github.com/skyhook-io/radar/pkg/topology" ) -// newSummaryContextBuilder assembles the per-request closure for MCP +// newResourceSummaryContextBuilder assembles the per-request closure for MCP // list_resources. Returns nil when the cache or topology isn't // available, in which case the caller should skip context attachment // rather than emit empty objects. @@ -32,7 +32,7 @@ import ( // per-hit between a namespaced and a cluster-wide index — search // returns mixed kinds in one response, so a single index can't get // both right. 
-func newSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { +func newResourceSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil @@ -76,7 +76,7 @@ var summaryCtxTopoMemo = topology.NewMemoizer(5 * time.Second) // buildSummaryContextTopology returns a topology snapshot suitable for // resolving managedBy pointers, reusing a cached snapshot when one is // fresh. Returns nil on failure — the caller falls back to a -// managedBy-less SummaryContext rather than failing the response. +// managedBy-less ResourceSummaryContext rather than failing the response. func buildSummaryContextTopology(namespaces []string) *topology.Topology { cache := k8s.GetResourceCache() if cache == nil { diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 0f967d9df..88cb0d1fa 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -525,15 +525,15 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li if clusterScoped { idxNamespaces = nil } - if builder := newSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachSummaryContextToTyped(results, objs, builder) + if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + attachResourceSummaryContextToTyped(results, objs, builder) } } return toJSONResult(results) } -// attachSummaryContextToTyped fills in SummaryContext on each +// attachResourceSummaryContextToTyped fills in SummaryContext on each // Summary-verbosity ResourceSummary in-place. results and objs are // produced in lockstep by MinifyList — a length mismatch is defensive // (skip rather than panic). 
@@ -541,7 +541,7 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li // Group is sourced per-object from each typed object's GVK (SetTypeMeta // is called by Minify, so apiVersion is reliable here) — passed through // to the builder so the per-resource issue lookup stays group-aware. -func attachSummaryContextToTyped(results []any, objs []runtime.Object, builder summarycontext.Builder) { +func attachResourceSummaryContextToTyped(results []any, objs []runtime.Object, builder summarycontext.Builder) { if len(results) != len(objs) { return } @@ -587,22 +587,22 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g if clusterScoped { idxNamespaces = nil } - if builder := newSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachSummaryContextToUnstructured(allItems, rawItems, builder) + if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + attachResourceSummaryContextToUnstructured(allItems, rawItems, builder) } } return toJSONResult(allItems) } -// attachSummaryContextToUnstructured fills in SummaryContext for the +// attachResourceSummaryContextToUnstructured fills in SummaryContext for the // dynamic-CRD list path. summarizeUnstructured returns // *aicontext.ResourceSummary, so the cast matches the typed path. // // Group comes from each unstructured's apiVersion so two CRDs that share // kind+ns+name across API groups (e.g. multiple operators each shipping // a "Cluster" resource) get independent issue counts. 
-func attachSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { +func attachResourceSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { if len(results) != len(items) { return } diff --git a/internal/search/search.go b/internal/search/search.go index f7b352809..df7b5da01 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -37,7 +37,7 @@ import ( // it through lets the builder distinguish CRDs that share // kind+namespace+name across groups (e.g. Knative Service vs corev1 // Service) in its per-resource issue index. -type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext // Provider abstracts the cache so tests can inject a fake. 
type Provider interface { diff --git a/internal/search/summary_context_test.go b/internal/search/summary_context_test.go index 0c54e0d72..0f8b3228e 100644 --- a/internal/search/summary_context_test.go +++ b/internal/search/summary_context_test.go @@ -32,10 +32,10 @@ func TestSearch_SummaryBuilderAttached(t *testing.T) { var calls int var gotGroup string - builder := func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { + builder := func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { calls++ gotGroup = group - return &resourcecontext.SummaryContext{ + return &resourcecontext.ResourceSummaryContext{ ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: namespace}, Health: "healthy", IssueCount: 0, diff --git a/internal/search/types.go b/internal/search/types.go index c37112b55..b33215853 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -52,7 +52,7 @@ type Hit struct { // SummaryContext is the compact per-row enrichment (managedBy, health, // issueCount). Populated by handlers via Options.SummaryBuilder; nil // when the caller opted out (context=none) or no fields apply. - SummaryContext *resourcecontext.SummaryContext `json:"summaryContext,omitempty"` + SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // MatchedField records where a query token landed (debug + UI highlight). @@ -93,4 +93,3 @@ const ( IncludeRaw IncludeNone // identity only (cheapest) ) - diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 362e4236c..bdbc7aa35 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -108,11 +108,11 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // so we pass nil here to compose cluster-wide. 
if !skipContext && level == aicontext.LevelSummary { idxNamespaces := issueIndexNamespaces(namespaces, kind, group) - if builder := s.newSummaryContextBuilder(idxNamespaces, kind); builder != nil { + if builder := s.newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { // Typed list resolves group from each object's TypeMeta — // MinifyList sets it via SetTypeMeta before producing rows, // so we can trust apiVersion on the typed source. - attachSummaryContextToList(results, objs, builder) + attachResourceSummaryContextToList(results, objs, builder) } } @@ -132,8 +132,8 @@ func issueIndexNamespaces(namespaces []string, kind, group string) []string { return namespaces } -// attachSummaryContextToList walks the typed-cache list and assigns the -// per-row SummaryContext into each ResourceSummary in-place. results and +// attachResourceSummaryContextToList walks the typed-cache list and assigns the +// per-row ResourceSummaryContext into each ResourceSummary in-place. results and // objs are produced in lockstep by MinifyList; a length mismatch is // defensive (and silently skips attachment rather than panicking) but // shouldn't occur in practice. @@ -141,7 +141,7 @@ func issueIndexNamespaces(namespaces []string, kind, group string) []string { // Group is sourced per-object from the typed object's GVK (via SetTypeMeta // + ObjectKind), so list paths that mix kinds — they don't today, but the // shape doesn't preclude it — stay correct. 
-func attachSummaryContextToList(results []any, objs []runtime.Object, builder summarycontext.Builder) { +func attachResourceSummaryContextToList(results []any, objs []runtime.Object, builder summarycontext.Builder) { if len(results) != len(objs) { return } @@ -155,14 +155,14 @@ func attachSummaryContextToList(results []any, objs []runtime.Object, builder su } } -// attachSummaryContextToUnstructuredList does the same for the dynamic +// attachResourceSummaryContextToUnstructuredList does the same for the dynamic // CRD path. MinifyUnstructured returns *ResourceSummary (Summary level) // so the cast is the same shape. // // Group comes from each unstructured's apiVersion — required for issue- // index lookups so two CRDs that share kind+ns+name across groups don't // collide on the per-resource count. -func attachSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { +func attachResourceSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { if len(results) != len(items) { return } @@ -234,8 +234,8 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 if !skipContext && level == aicontext.LevelSummary { idxNamespaces := issueIndexNamespaces(namespaces, kind, group) - if builder := s.newSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachSummaryContextToUnstructuredList(results, allItems, builder) + if builder := s.newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + attachResourceSummaryContextToUnstructuredList(results, allItems, builder) } } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index 2ec5b0bf9..ad227c75b 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -1,4 +1,4 @@ -// Per-request helpers that compute the compact SummaryContext attached +// Per-request helpers that compute the compact 
ResourceSummaryContext attached // to /api/ai/resources/{kind} list rows and /api/search hits. // // The shared core (issue index, kind canonicalization, managedBy @@ -14,7 +14,7 @@ import ( "github.com/skyhook-io/radar/internal/summarycontext" ) -// newSummaryContextBuilder assembles the per-request closure for the +// newResourceSummaryContextBuilder assembles the per-request closure for the // list/search handlers. Returns nil when the cache or topology isn't // available, in which case callers should skip context attachment // rather than emit empty objects. @@ -27,7 +27,7 @@ import ( // Use newSearchSummaryContextBuilder for search, which routes per-hit // between a namespaced and a cluster-wide index — search returns mixed // kinds in one response, so a single index can't get both right. -func (s *Server) newSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { +func (s *Server) newResourceSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 848744c00..414a92547 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -1,4 +1,4 @@ -// Wiring tests for the REST-side SummaryContext builders. The pure- +// Wiring tests for the REST-side ResourceSummaryContext builders. The pure- // function tests (issueIndex key arithmetic, BuildIssueIndex over a // fake provider, CanonicalSingular, ManagedByFromRelationships) live in // internal/summarycontext alongside the shared core they exercise. @@ -19,24 +19,24 @@ import ( "github.com/skyhook-io/radar/pkg/resourcecontext" ) -// stubBuilder records calls and returns a deterministic SummaryContext +// stubBuilder records calls and returns a deterministic ResourceSummaryContext // keyed by the resource identity. 
Avoids standing up a topology cache or // issue provider — those are exercised by the per-layer unit tests. // // Key shape mirrors the production issueIndexKey (group|kind|ns|name) // so test fixtures pin the group-aware lookup. -func stubBuilder(t *testing.T, want map[string]*resourcecontext.SummaryContext) summarycontext.Builder { +func stubBuilder(t *testing.T, want map[string]*resourcecontext.ResourceSummaryContext) summarycontext.Builder { t.Helper() - return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { key := group + "|" + kind + "|" + namespace + "|" + name return want[key] } } -// TestAttachSummaryContextToList wires together MinifyList + the -// per-row attach helper and asserts the SummaryContext field lands in +// TestAttachResourceSummaryContextToList wires together MinifyList + the +// per-row attach helper and asserts the ResourceSummaryContext field lands in // the JSON each row marshals to. -func TestAttachSummaryContextToList(t *testing.T) { +func TestAttachResourceSummaryContextToList(t *testing.T) { objs := []runtime.Object{ &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "api-1", Namespace: "prod"}, @@ -48,7 +48,7 @@ func TestAttachSummaryContextToList(t *testing.T) { }, } // Group is "" for core-group Pods. 
- want := map[string]*resourcecontext.SummaryContext{ + want := map[string]*resourcecontext.ResourceSummaryContext{ "|Pod|prod|api-1": { ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, Health: "healthy", @@ -65,7 +65,7 @@ func TestAttachSummaryContextToList(t *testing.T) { if err != nil { t.Fatalf("MinifyList: %v", err) } - attachSummaryContextToList(results, objs, stubBuilder(t, want)) + attachResourceSummaryContextToList(results, objs, stubBuilder(t, want)) // Row 0 — healthy pod. b, _ := json.Marshal(results[0]) @@ -93,10 +93,10 @@ func TestAttachSummaryContextToList(t *testing.T) { } } -// TestAttachSummaryContextToList_MismatchedLengthsSilent — defensive +// TestAttachResourceSummaryContextToList_MismatchedLengthsSilent — defensive // path that protects against a future refactor where MinifyList might // drop unsupported kinds. Attach must skip rather than panic. -func TestAttachSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { +func TestAttachResourceSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { objs := []runtime.Object{ &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "api-1"}}, } @@ -105,8 +105,8 @@ func TestAttachSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, } // Length mismatch (1 obj vs 2 results) — must not panic, must skip. 
- attachSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { - return &resourcecontext.SummaryContext{Health: "healthy"} + attachResourceSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + return &resourcecontext.ResourceSummaryContext{Health: "healthy"} }) for i, row := range results { summary, ok := row.(*aicontext.ResourceSummary) @@ -114,15 +114,15 @@ func TestAttachSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { t.Fatalf("row %d: unexpected type %T", i, row) } if summary.SummaryContext != nil { - t.Errorf("row %d: SummaryContext should be nil on length mismatch, got %#v", i, summary.SummaryContext) + t.Errorf("row %d: ResourceSummaryContext should be nil on length mismatch, got %#v", i, summary.SummaryContext) } } } -// TestAttachSummaryContextToUnstructuredList covers the dynamic-CRD +// TestAttachResourceSummaryContextToUnstructuredList covers the dynamic-CRD // path. summarizeUnstructured returns *ResourceSummary so the attach // helper is symmetric with the typed path. 
-func TestAttachSummaryContextToUnstructuredList(t *testing.T) { +func TestAttachResourceSummaryContextToUnstructuredList(t *testing.T) { items := []*unstructured.Unstructured{ {Object: map[string]any{ "apiVersion": "argoproj.io/v1alpha1", @@ -131,7 +131,7 @@ func TestAttachSummaryContextToUnstructuredList(t *testing.T) { "status": map[string]any{"conditions": []any{map[string]any{"type": "Ready", "status": "True"}}}, }}, } - want := map[string]*resourcecontext.SummaryContext{ + want := map[string]*resourcecontext.ResourceSummaryContext{ "argoproj.io|Application|argocd|storefront": { Health: "healthy", IssueCount: 1, @@ -139,14 +139,14 @@ func TestAttachSummaryContextToUnstructuredList(t *testing.T) { } results := []any{aicontext.MinifyUnstructured(items[0], aicontext.LevelSummary)} - attachSummaryContextToUnstructuredList(results, items, stubBuilder(t, want)) + attachResourceSummaryContextToUnstructuredList(results, items, stubBuilder(t, want)) summary, ok := results[0].(*aicontext.ResourceSummary) if !ok || summary == nil { t.Fatalf("unexpected row type %T", results[0]) } if summary.SummaryContext == nil { - t.Fatalf("SummaryContext not attached") + t.Fatalf("ResourceSummaryContext not attached") } if summary.SummaryContext.Health != "healthy" { t.Errorf("Health = %q, want healthy", summary.SummaryContext.Health) diff --git a/internal/summarycontext/summarycontext.go b/internal/summarycontext/summarycontext.go index 0123a858c..8c29a8acc 100644 --- a/internal/summarycontext/summarycontext.go +++ b/internal/summarycontext/summarycontext.go @@ -1,5 +1,5 @@ // Package summarycontext is the shared core that powers the compact -// SummaryContext attached to /api/ai/resources/{kind} list rows, /api/search +// ResourceSummaryContext attached to /api/ai/resources/{kind} list rows, /api/search // hits, and the MCP list_resources / search variants. 
// // The REST and MCP wrappers (internal/server, internal/mcp) differ only @@ -24,15 +24,15 @@ import ( "github.com/skyhook-io/radar/pkg/topology" ) -// Builder is the per-request closure that produces a SummaryContext for -// a single resource. nil result is fine — the SummaryContext field is +// Builder is the per-request closure that produces a ResourceSummaryContext for +// a single resource. nil result is fine — the ResourceSummaryContext field is // omitempty on every consumer. // // group is required so the per-resource issue lookup can distinguish // CRDs that share kind+namespace+name across API groups (e.g. Knative // Service vs corev1 Service, or two custom CRDs both named "Cluster" // from different operators). Pass "" for core-group resources. -type Builder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext +type Builder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext // BuilderFromIndexes assembles the per-request closure. 
The list path // passes the same index for both namespacedIdx and clusterIdx (single- @@ -58,7 +58,7 @@ func BuilderFromIndexes(topo *topology.Topology, namespacedIdx, clusterIdx Issue relIdx = topology.IndexByResource(topo) } - return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.SummaryContext { + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { var managedBy *resourcecontext.ManagedByRef if topo != nil { // Pass the fetched object when available so synthesis is diff --git a/pkg/ai/context/summary.go b/pkg/ai/context/summary.go index 62f53b651..fd27d4c6f 100644 --- a/pkg/ai/context/summary.go +++ b/pkg/ai/context/summary.go @@ -42,40 +42,42 @@ type ResourceSummary struct { Finalizers []string `json:"finalizers,omitempty"` // Type-specific fields (only populated when relevant) - Image string `json:"image,omitempty"` - Ports string `json:"ports,omitempty"` - Schedule string `json:"schedule,omitempty"` - Type string `json:"type,omitempty"` // Service type, Secret type - Selector string `json:"selector,omitempty"` - ClusterIP string `json:"clusterIP,omitempty"` - Hosts []string `json:"hosts,omitempty"` - Restarts int32 `json:"restarts,omitempty"` - Node string `json:"node,omitempty"` - Strategy string `json:"strategy,omitempty"` - Completions string `json:"completions,omitempty"` - Duration string `json:"duration,omitempty"` + Image string `json:"image,omitempty"` + Ports string `json:"ports,omitempty"` + Schedule string `json:"schedule,omitempty"` + Type string `json:"type,omitempty"` // Service type, Secret type + Selector string `json:"selector,omitempty"` + ClusterIP string `json:"clusterIP,omitempty"` + Hosts []string `json:"hosts,omitempty"` + Restarts int32 `json:"restarts,omitempty"` + Node string `json:"node,omitempty"` + Strategy string `json:"strategy,omitempty"` + Completions string 
`json:"completions,omitempty"` + Duration string `json:"duration,omitempty"` Suspended *bool `json:"suspended,omitempty"` Unschedulable *bool `json:"unschedulable,omitempty"` - Active int `json:"active,omitempty"` - Target string `json:"target,omitempty"` - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas int32 `json:"maxReplicas,omitempty"` - Current int32 `json:"current,omitempty"` - Desired int32 `json:"desired,omitempty"` - Roles []string `json:"roles,omitempty"` - Version string `json:"version,omitempty"` - Pressures []string `json:"pressures,omitempty"` - Keys []string `json:"keys,omitempty"` - StorageClass string `json:"storageClass,omitempty"` - Capacity string `json:"capacity,omitempty"` - AccessModes []string `json:"accessModes,omitempty"` - Owner string `json:"owner,omitempty"` + Active int `json:"active,omitempty"` + Target string `json:"target,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas int32 `json:"maxReplicas,omitempty"` + Current int32 `json:"current,omitempty"` + Desired int32 `json:"desired,omitempty"` + Roles []string `json:"roles,omitempty"` + Version string `json:"version,omitempty"` + Pressures []string `json:"pressures,omitempty"` + Keys []string `json:"keys,omitempty"` + StorageClass string `json:"storageClass,omitempty"` + Capacity string `json:"capacity,omitempty"` + AccessModes []string `json:"accessModes,omitempty"` + Owner string `json:"owner,omitempty"` // SummaryContext is the per-row enrichment attached by AI-facing list // surfaces (REST /api/ai/resources/{kind}, MCP list_resources, search // hits). Populated by handlers post-minify via resourcecontext.BuildSummary; // nil when the caller opted out (?context=none) or when no fields apply. - SummaryContext *resourcecontext.SummaryContext `json:"summaryContext,omitempty"` + // Type is resourcecontext.ResourceSummaryContext — the field name keeps + // the shorter "SummaryContext" form to match the wire JSON tag. 
+ SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // summarize dispatches to the appropriate per-type extractor and then diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go index 86ec0ad9e..15302bff0 100644 --- a/pkg/resourcecontext/summary.go +++ b/pkg/resourcecontext/summary.go @@ -34,13 +34,14 @@ type SummaryOptions struct { Health string } -// BuildSummary produces the compact per-result summaryContext attached to -// list_resources, /api/ai/resources/{kind} list, and search hits. +// BuildSummary produces the compact per-result ResourceSummaryContext +// attached to list_resources, /api/ai/resources/{kind} list, and search +// hits. // // Tightly bounded — only the triage fields needed to choose a next hop. // Returns nil when all three fields would be empty so callers can // `omitempty` the entire object on bare results and keep the wire shape minimal. -func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { +func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryContext { health := opts.Health if health == "" { health = deriveHealth(obj) @@ -48,7 +49,7 @@ func BuildSummary(obj runtime.Object, opts SummaryOptions) *SummaryContext { if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { return nil } - return &SummaryContext{ + return &ResourceSummaryContext{ ManagedBy: opts.ManagedBy, Health: health, IssueCount: opts.IssueCount, diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go index cd0045010..96c4979be 100644 --- a/pkg/resourcecontext/summary_test.go +++ b/pkg/resourcecontext/summary_test.go @@ -171,7 +171,7 @@ func TestBuildSummary_DeploymentHealthDuringScaleDown(t *testing.T) { } got := BuildSummary(dep, SummaryOptions{}) if got == nil { - t.Fatal("got nil, want SummaryContext with health=healthy") + t.Fatal("got nil, want ResourceSummaryContext with health=healthy") } if got.Health != "healthy" { 
t.Errorf("Health = %q, want %q (Spec.Replicas=2 ready, Status.Replicas=4 due to draining)", got.Health, "healthy") diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 15f5cfee7..93dcbd3ca 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -62,20 +62,23 @@ type ContextRef struct { } // ManagedByRef is the compact form of a "managed-by" pointer used in -// SummaryContext (list/search rows). Carries Kind alongside Source so +// ResourceSummaryContext (list/search rows). Carries Kind alongside Source so // consumers can distinguish e.g. a Flux Kustomization from a Flux // HelmRelease without re-parsing the Source string. Intentionally lacks // Group to keep per-row bytes minimal. type ManagedByRef struct { - Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … - Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" + Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … + Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" Name string `json:"name"` Namespace string `json:"namespace,omitempty"` } -// SummaryContext is the per-row enrichment attached to list_resources -// and search hits. Always-on, intentionally minimal (≤ ~60 bytes). -type SummaryContext struct { +// ResourceSummaryContext is the per-row enrichment attached to +// list_resources and search hits. The row-tier companion to +// ResourceContext (the detail-tier enrichment on GET responses) — +// optimised for bulk triage on lists at ≤ ~60 bytes per row. Always-on +// when the caller didn't opt out via context=none. 
+type ResourceSummaryContext struct { ManagedBy *ManagedByRef `json:"managedBy,omitempty"` Health string `json:"health,omitempty"` IssueCount int `json:"issueCount,omitempty"` diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index 2e891594c..cfce2925c 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -217,10 +217,10 @@ func TestResourceContextRoundTrip(t *testing.T) { } } -// TestSummaryContextRoundTrip covers SummaryContext + ManagedByRef +// TestResourceSummaryContextRoundTrip covers ResourceSummaryContext + ManagedByRef // which are not embedded in ResourceContext. -func TestSummaryContextRoundTrip(t *testing.T) { - orig := SummaryContext{ +func TestResourceSummaryContextRoundTrip(t *testing.T) { + orig := ResourceSummaryContext{ ManagedBy: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, Health: "degraded", IssueCount: 2, @@ -229,7 +229,7 @@ func TestSummaryContextRoundTrip(t *testing.T) { if err != nil { t.Fatalf("marshal: %v", err) } - var got SummaryContext + var got ResourceSummaryContext if err := json.Unmarshal(b, &got); err != nil { t.Fatalf("unmarshal: %v", err) } @@ -244,12 +244,12 @@ func TestSummaryContextRoundTrip(t *testing.T) { s := string(b) for _, sub := range wantSubstr { if !strings.Contains(s, sub) { - t.Errorf("SummaryContext JSON missing %s: %s", sub, s) + t.Errorf("ResourceSummaryContext JSON missing %s: %s", sub, s) } } for _, forbidden := range []string{`"group"`} { if strings.Contains(s, forbidden) { - t.Errorf("SummaryContext JSON leaks %s: %s", forbidden, s) + t.Errorf("ResourceSummaryContext JSON leaks %s: %s", forbidden, s) } } } @@ -258,8 +258,8 @@ func TestSummaryContextRoundTrip(t *testing.T) { // without it, Flux Kustomization vs HelmRelease serialize to identical // JSON, forcing consumers to parse the Source string. 
func TestManagedByRefDistinguishesFluxKinds(t *testing.T) { - kustomization := SummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} - helmRelease := SummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + kustomization := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + helmRelease := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} kJSON, _ := json.Marshal(kustomization) hJSON, _ := json.Marshal(helmRelease) From 34ad05cbf7421da9f5bc7591f5b0904b31a64eee Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 18:13:56 +0300 Subject: [PATCH 20/33] refactor(summarycontext): centralize attach helpers + group extractors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit groupFromObject, groupFromUnstructured, and the two attachResourceSummaryContext* helpers were byte-identical between internal/server/ai_handlers.go and internal/mcp/tools.go. The internal/summarycontext package exists explicitly to centralize shared per-row builder logic — moved all four there as GroupFromObject, GroupFromUnstructured, AttachToTypedList, and AttachToUnstructuredList. Eliminates the maintenance hazard of two copies drifting. REST and MCP wrappers now share one source of truth alongside the existing Builder type. 
--- internal/mcp/tools.go | 68 +----------------------- internal/server/ai_handlers.go | 70 +------------------------ internal/server/summary_context_test.go | 6 +-- internal/summarycontext/attach.go | 70 +++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 137 deletions(-) create mode 100644 internal/summarycontext/attach.go diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 88cb0d1fa..3b0680050 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -14,7 +14,6 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/filter" "github.com/skyhook-io/radar/internal/helm" @@ -526,35 +525,13 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li idxNamespaces = nil } if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachResourceSummaryContextToTyped(results, objs, builder) + summarycontext.AttachToTypedList(results, objs, builder) } } return toJSONResult(results) } -// attachResourceSummaryContextToTyped fills in SummaryContext on each -// Summary-verbosity ResourceSummary in-place. results and objs are -// produced in lockstep by MinifyList — a length mismatch is defensive -// (skip rather than panic). -// -// Group is sourced per-object from each typed object's GVK (SetTypeMeta -// is called by Minify, so apiVersion is reliable here) — passed through -// to the builder so the per-resource issue lookup stays group-aware. 
-func attachResourceSummaryContextToTyped(results []any, objs []runtime.Object, builder summarycontext.Builder) { - if len(results) != len(objs) { - return - } - for i, row := range results { - summary, ok := row.(*aicontext.ResourceSummary) - if !ok || summary == nil { - continue - } - group := groupFromObject(objs[i]) - summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) - } -} - func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, clusterScoped bool, contextMode string) (*mcp.CallToolResult, any, error) { var rawItems []*unstructured.Unstructured if len(namespaces) > 0 { @@ -588,54 +565,13 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g idxNamespaces = nil } if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachResourceSummaryContextToUnstructured(allItems, rawItems, builder) + summarycontext.AttachToUnstructuredList(allItems, rawItems, builder) } } return toJSONResult(allItems) } -// attachResourceSummaryContextToUnstructured fills in SummaryContext for the -// dynamic-CRD list path. summarizeUnstructured returns -// *aicontext.ResourceSummary, so the cast matches the typed path. -// -// Group comes from each unstructured's apiVersion so two CRDs that share -// kind+ns+name across API groups (e.g. multiple operators each shipping -// a "Cluster" resource) get independent issue counts. 
-func attachResourceSummaryContextToUnstructured(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { - if len(results) != len(items) { - return - } - for i, row := range results { - summary, ok := row.(*aicontext.ResourceSummary) - if !ok || summary == nil { - continue - } - group := groupFromUnstructured(items[i]) - summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) - } -} - -// groupFromObject extracts the API group from a typed runtime.Object's -// GroupVersionKind. Returns "" for core-group objects (Pod, Service, -// etc.) and when the GVK is unset. -func groupFromObject(obj runtime.Object) string { - if obj == nil { - return "" - } - k8s.SetTypeMeta(obj) - return obj.GetObjectKind().GroupVersionKind().Group -} - -// groupFromUnstructured pulls the API group from an unstructured's -// apiVersion. Mirrors groupFromObject for the dynamic-CRD path. -func groupFromUnstructured(u *unstructured.Unstructured) string { - if u == nil { - return "" - } - return u.GroupVersionKind().Group -} - func handleGetResource(ctx context.Context, req *mcp.CallToolRequest, input getResourceInput) (*mcp.CallToolResult, any, error) { cache := k8s.GetResourceCache() if cache == nil { diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index bdbc7aa35..617397906 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -7,7 +7,6 @@ import ( "github.com/go-chi/chi/v5" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/k8s" "github.com/skyhook-io/radar/internal/summarycontext" @@ -112,7 +111,7 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // Typed list resolves group from each object's TypeMeta — // MinifyList sets it via SetTypeMeta before producing rows, // so we can trust apiVersion on the typed source. 
- attachResourceSummaryContextToList(results, objs, builder) + summarycontext.AttachToTypedList(results, objs, builder) } } @@ -132,71 +131,6 @@ func issueIndexNamespaces(namespaces []string, kind, group string) []string { return namespaces } -// attachResourceSummaryContextToList walks the typed-cache list and assigns the -// per-row ResourceSummaryContext into each ResourceSummary in-place. results and -// objs are produced in lockstep by MinifyList; a length mismatch is -// defensive (and silently skips attachment rather than panicking) but -// shouldn't occur in practice. -// -// Group is sourced per-object from the typed object's GVK (via SetTypeMeta -// + ObjectKind), so list paths that mix kinds — they don't today, but the -// shape doesn't preclude it — stay correct. -func attachResourceSummaryContextToList(results []any, objs []runtime.Object, builder summarycontext.Builder) { - if len(results) != len(objs) { - return - } - for i, row := range results { - summary, ok := row.(*aicontext.ResourceSummary) - if !ok || summary == nil { - continue - } - group := groupFromObject(objs[i]) - summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) - } -} - -// attachResourceSummaryContextToUnstructuredList does the same for the dynamic -// CRD path. MinifyUnstructured returns *ResourceSummary (Summary level) -// so the cast is the same shape. -// -// Group comes from each unstructured's apiVersion — required for issue- -// index lookups so two CRDs that share kind+ns+name across groups don't -// collide on the per-resource count. 
-func attachResourceSummaryContextToUnstructuredList(results []any, items []*unstructured.Unstructured, builder summarycontext.Builder) { - if len(results) != len(items) { - return - } - for i, row := range results { - summary, ok := row.(*aicontext.ResourceSummary) - if !ok || summary == nil { - continue - } - group := groupFromUnstructured(items[i]) - summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) - } -} - -// groupFromObject extracts the API group from a typed runtime.Object's -// GroupVersionKind. Returns "" for core-group objects and when the GVK -// is unset (callers should SetTypeMeta first, but we don't panic on -// the missing case). -func groupFromObject(obj runtime.Object) string { - if obj == nil { - return "" - } - k8s.SetTypeMeta(obj) - return obj.GetObjectKind().GroupVersionKind().Group -} - -// groupFromUnstructured pulls the API group from an unstructured's -// apiVersion. Mirrors groupFromObject for the dynamic-CRD path. -func groupFromUnstructured(u *unstructured.Unstructured) string { - if u == nil { - return "" - } - return u.GroupVersionKind().Group -} - // aiListDynamic handles the CRD/dynamic fallback for AI list. 
func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel, skipContext bool) { var allItems []*unstructured.Unstructured @@ -235,7 +169,7 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 if !skipContext && level == aicontext.LevelSummary { idxNamespaces := issueIndexNamespaces(namespaces, kind, group) if builder := s.newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { - attachResourceSummaryContextToUnstructuredList(results, allItems, builder) + summarycontext.AttachToUnstructuredList(results, allItems, builder) } } diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go index 414a92547..af4fb3f22 100644 --- a/internal/server/summary_context_test.go +++ b/internal/server/summary_context_test.go @@ -65,7 +65,7 @@ func TestAttachResourceSummaryContextToList(t *testing.T) { if err != nil { t.Fatalf("MinifyList: %v", err) } - attachResourceSummaryContextToList(results, objs, stubBuilder(t, want)) + summarycontext.AttachToTypedList(results, objs, stubBuilder(t, want)) // Row 0 — healthy pod. b, _ := json.Marshal(results[0]) @@ -105,7 +105,7 @@ func TestAttachResourceSummaryContextToList_MismatchedLengthsSilent(t *testing.T &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, } // Length mismatch (1 obj vs 2 results) — must not panic, must skip. 
- attachResourceSummaryContextToList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + summarycontext.AttachToTypedList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { return &resourcecontext.ResourceSummaryContext{Health: "healthy"} }) for i, row := range results { @@ -139,7 +139,7 @@ func TestAttachResourceSummaryContextToUnstructuredList(t *testing.T) { } results := []any{aicontext.MinifyUnstructured(items[0], aicontext.LevelSummary)} - attachResourceSummaryContextToUnstructuredList(results, items, stubBuilder(t, want)) + summarycontext.AttachToUnstructuredList(results, items, stubBuilder(t, want)) summary, ok := results[0].(*aicontext.ResourceSummary) if !ok || summary == nil { diff --git a/internal/summarycontext/attach.go b/internal/summarycontext/attach.go new file mode 100644 index 000000000..3f0ea18ce --- /dev/null +++ b/internal/summarycontext/attach.go @@ -0,0 +1,70 @@ +package summarycontext + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/k8s" +) + +// AttachToTypedList fills in SummaryContext for each *aicontext.ResourceSummary +// row produced from typed runtime.Object items (typed-cache list path). +// results and objs must be parallel slices — length mismatch is treated as a +// caller bug and the function returns without touching the rows. +// +// Group is sourced per-object from the typed object's GVK via SetTypeMeta + +// GetObjectKind, so list paths that mix kinds stay correct. 
+func AttachToTypedList(results []any, objs []runtime.Object, builder Builder) { + if len(results) != len(objs) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromObject(objs[i]) + summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// AttachToUnstructuredList is the dynamic-CRD counterpart of +// AttachToTypedList. Group comes from each item's apiVersion so two CRDs that +// share kind+ns+name across API groups (e.g. multiple operators each shipping +// a "Cluster" resource) get independent issue counts. +func AttachToUnstructuredList(results []any, items []*unstructured.Unstructured, builder Builder) { + if len(results) != len(items) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromUnstructured(items[i]) + summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// GroupFromObject extracts the API group from a typed runtime.Object's +// GroupVersionKind. Returns "" for core-group objects (Pod, Service, etc.) +// and when the GVK is unset. Calls k8s.SetTypeMeta so the GVK is populated +// from scheme metadata when the object came out of the typed cache without +// it set. +func GroupFromObject(obj runtime.Object) string { + if obj == nil { + return "" + } + k8s.SetTypeMeta(obj) + return obj.GetObjectKind().GroupVersionKind().Group +} + +// GroupFromUnstructured pulls the API group from an unstructured's apiVersion. +// Mirrors GroupFromObject for the dynamic-CRD path. 
+func GroupFromUnstructured(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + return u.GroupVersionKind().Group +} From ac10b5bab192b49c61f95b4f679c19c189873e64 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 21:30:20 +0300 Subject: [PATCH 21/33] fix(summarycontext): drop kindFilter so CRD plurals don't zero issueCount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BuildIssueIndex used to set filters.Kinds = [CanonicalSingular(kindFilter)] on the Compose call. CanonicalSingular only knows built-in plurals, so a CRD listed by its plural form (e.g. ArgoCD "applications") fell through unchanged. Compose's case-insensitive Kind filter then failed against the singular "Application" the issue engine emits, and every CRD row's issueCount silently became 0. The MCP tool description encourages plural forms ("e.g. pods, deployments"), so this hit the common path for agents inspecting CRDs. Fix: drop the Kinds filter entirely. The per-row bucketing in issueIndexKey(group, kind, ns, name) already discriminates correctly because the lookup side runs the row's singular Kind through CanonicalSingular too — index and query agree without needing a Compose-time filter. Per-namespace issue volumes are tiny (~tens to low hundreds), so the extra bucket work is negligible. Removed `kindFilter` from BuildIssueIndex's signature and from both newResourceSummaryContextBuilder wrappers (REST + MCP) since neither needs it anymore. Added TestBuildIssueIndex_CRDPlural_NonZeroCount to pin the contract. 
--- internal/mcp/summary_context.go | 12 ++--- internal/mcp/tools.go | 4 +- internal/server/ai_handlers.go | 4 +- internal/server/summary_context.go | 8 +-- internal/summarycontext/summarycontext.go | 18 ++++--- .../summarycontext/summarycontext_test.go | 51 ++++++++++++++++--- 6 files changed, 69 insertions(+), 28 deletions(-) diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go index 85ff7b122..78f36cf0f 100644 --- a/internal/mcp/summary_context.go +++ b/internal/mcp/summary_context.go @@ -24,20 +24,18 @@ import ( // rather than emit empty objects. // // namespaces scopes the issue index to just the rows being returned; -// pass nil for cluster-wide. kindFilter ("" for search, the requested -// kind for list_resources) narrows the issue compose to a single kind -// so list_resources kind=pod doesn't pull deployment + service issues. +// pass nil for cluster-wide. // // Use newSearchSummaryContextBuilder for MCP search, which routes // per-hit between a namespaced and a cluster-wide index — search // returns mixed kinds in one response, so a single index can't get // both right. 
-func newResourceSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { +func newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - idx := summarycontext.BuildIssueIndex(provider, namespaces, kindFilter) + idx := summarycontext.BuildIssueIndex(provider, namespaces) return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(namespaces), idx, idx) } @@ -53,10 +51,10 @@ func newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Buil if provider == nil { return nil } - namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces, "") + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) clusterIdx := namespacedIdx if scanNamespaces != nil { - clusterIdx = summarycontext.BuildIssueIndex(provider, nil, "") + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) } return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(scanNamespaces), namespacedIdx, clusterIdx) } diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 3b0680050..7852a106a 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -524,7 +524,7 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li if clusterScoped { idxNamespaces = nil } - if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { summarycontext.AttachToTypedList(results, objs, builder) } } @@ -564,7 +564,7 @@ func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, g if clusterScoped { idxNamespaces = nil } - if builder := newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { summarycontext.AttachToUnstructuredList(allItems, rawItems, builder) } } diff --git 
a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 617397906..d94976a02 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -107,7 +107,7 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { // so we pass nil here to compose cluster-wide. if !skipContext && level == aicontext.LevelSummary { idxNamespaces := issueIndexNamespaces(namespaces, kind, group) - if builder := s.newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { // Typed list resolves group from each object's TypeMeta — // MinifyList sets it via SetTypeMeta before producing rows, // so we can trust apiVersion on the typed source. @@ -168,7 +168,7 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 if !skipContext && level == aicontext.LevelSummary { idxNamespaces := issueIndexNamespaces(namespaces, kind, group) - if builder := s.newResourceSummaryContextBuilder(idxNamespaces, kind); builder != nil { + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { summarycontext.AttachToUnstructuredList(results, allItems, builder) } } diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go index ad227c75b..6853930a1 100644 --- a/internal/server/summary_context.go +++ b/internal/server/summary_context.go @@ -27,12 +27,12 @@ import ( // Use newSearchSummaryContextBuilder for search, which routes per-hit // between a namespaced and a cluster-wide index — search returns mixed // kinds in one response, so a single index can't get both right. 
-func (s *Server) newResourceSummaryContextBuilder(namespaces []string, kindFilter string) summarycontext.Builder { +func (s *Server) newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder { provider := issues.NewCacheProvider() if provider == nil { return nil } - idx := summarycontext.BuildIssueIndex(provider, namespaces, kindFilter) + idx := summarycontext.BuildIssueIndex(provider, namespaces) return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), idx, idx) } @@ -61,10 +61,10 @@ func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summary if provider == nil { return nil } - namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces, "") + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) clusterIdx := namespacedIdx if scanNamespaces != nil { - clusterIdx = summarycontext.BuildIssueIndex(provider, nil, "") + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) } return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), namespacedIdx, clusterIdx) } diff --git a/internal/summarycontext/summarycontext.go b/internal/summarycontext/summarycontext.go index 8c29a8acc..e375e6ab2 100644 --- a/internal/summarycontext/summarycontext.go +++ b/internal/summarycontext/summarycontext.go @@ -176,17 +176,21 @@ func CanonicalSingular(kind string) string { // + "condition" only. Audit + Warning events are loud and require // explicit opt-in; rolling them into the per-row count would distort // "this Pod has 1 issue" for the common case. -func BuildIssueIndex(p issues.Provider, namespaces []string, kindFilter string) IssueIndex { +// +// No Kinds filter on Compose: the index buckets every composed row by +// (group, kind, ns, name), and the per-row lookup keys off +// issueIndexKey(...) with the same canonicalization, so kind-mismatched +// rows simply never read. 
Filtering Compose itself by Kind would need +// CRD-plural awareness — CanonicalSingular handles built-ins but +// returns CRD plurals (e.g. "applications") unchanged, and the issue +// engine emits "Application", silently zeroing issueCount on every CRD +// row. Bucketing is O(N) over the at-most-namespace-bounded issue set, +// which the consumer materialises anyway. +func BuildIssueIndex(p issues.Provider, namespaces []string) IssueIndex { filters := issues.Filters{ Namespaces: namespaces, Limit: issues.NoLimit, } - if kindFilter != "" { - // Compose's Kinds filter expects the singular kind ("Pod"). The - // caller may pass either the URL plural ("pods") or the singular — - // CanonicalSingular normalizes both before issuing the filter. - filters.Kinds = []string{CanonicalSingular(kindFilter)} - } composed := issues.Compose(p, filters) idx := make(IssueIndex, len(composed)) for _, iss := range composed { diff --git a/internal/summarycontext/summarycontext_test.go b/internal/summarycontext/summarycontext_test.go index b91c0a7e5..e1b68769d 100644 --- a/internal/summarycontext/summarycontext_test.go +++ b/internal/summarycontext/summarycontext_test.go @@ -105,7 +105,7 @@ func TestBuildIssueIndex_GroupAware(t *testing.T) { {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, }, } - idx := BuildIssueIndex(p, nil, "") + idx := BuildIssueIndex(p, nil) if got := idx.Count("", "Service", "prod", "api"); got != 1 { t.Errorf("core Service count = %d, want 1", got) } @@ -128,7 +128,7 @@ func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { }) } p := &fakeIssuesProvider{problems: probs} - idx := BuildIssueIndex(p, nil, "") + idx := BuildIssueIndex(p, nil) tailName := fmtPodName(issues.MaxLimit + 25) if got := idx.Count("", "Pod", "prod", tailName); got != 1 { t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) @@ -172,7 +172,7 @@ func 
TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) } // Cluster-wide compose (nil namespaces) — issue surfaces. - idx := BuildIssueIndex(p, nil, "Node") + idx := BuildIssueIndex(p, nil) if got := idx.Count("", "Node", "", "worker-1"); got != 1 { t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) } @@ -181,12 +181,51 @@ func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) // ["prod","staging"] drops it because the user-namespaced perm // slice never matches "". This is what the pre-fix handler did for // Node lists. - scopedIdx := BuildIssueIndex(p, []string{"prod", "staging"}, "Node") + scopedIdx := BuildIssueIndex(p, []string{"prod", "staging"}) if got := scopedIdx.Count("", "Node", "", "worker-1"); got != 0 { t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) } } +// TestBuildIssueIndex_CRDPlural_NonZeroCount pins the fix for a Bugbot +// finding on PR #722: a CRD listed by its plural form (e.g. +// "applications" for ArgoCD Application) silently returned +// issueCount=0 because BuildIssueIndex used to push the URL kind +// through CanonicalSingular into filters.Kinds. CanonicalSingular only +// covers built-in plurals — CRD plurals fell through unchanged +// ("applications" stayed "applications"), Compose's case-insensitive +// Kind filter then failed against the singular "Application" the +// issue engine emits, and every CRD row's count was zero. We dropped +// the Kinds filter entirely: bucketing by issueIndexKey(group, kind, +// ns, name) is already correct because the lookup side runs through +// CanonicalSingular too. Per-resource lookup uses the row's singular +// Kind (Pascal "Application") so the index and the query agree. 
+func TestBuildIssueIndex_CRDPlural_NonZeroCount(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Application", Group: "argoproj.io", Namespace: "argocd", Name: "storefront", Reason: "SyncFailed", Severity: "critical"}, + }, + } + + // Pre-fix simulation: the handler would have passed kindFilter="applications" + // — the URL plural. We no longer take a kindFilter, but verify that + // the index contains the row keyed by the canonical singular form. + idx := BuildIssueIndex(p, []string{"argocd"}) + if got := idx.Count("argoproj.io", "Application", "argocd", "storefront"); got != 1 { + t.Errorf("CRD Application count (singular kind) = %d, want 1", got) + } + // Also pin the URL-form lookup path: the per-row Builder is called + // with the kind as returned by MinifyUnstructured, which for CRDs + // is the singular ("Application"). If a caller ever pushed the + // plural ("applications") through Count(), CanonicalSingular won't + // normalize unknown CRD plurals — that's a separate latent issue + // that doesn't manifest today because the row source uses the + // singular. Document the asymmetry explicitly. + if got := idx.Count("argoproj.io", "applications", "argocd", "storefront"); got != 0 { + t.Errorf("CRD lookup via plural = %d, want 0 (CanonicalSingular only normalizes built-ins; row source uses singular Kind, so lookup matches via singular path)", got) + } +} + // TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end // shape used by /api/search and MCP search: scanNamespaces is non-nil // (a namespace-restricted user, or a user with a `ns:` query modifier), @@ -204,8 +243,8 @@ func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { } // Build the two indexes the search constructor would build. 
- namespacedIdx := BuildIssueIndex(p, []string{"prod"}, "") - clusterIdx := BuildIssueIndex(p, nil, "") + namespacedIdx := BuildIssueIndex(p, []string{"prod"}) + clusterIdx := BuildIssueIndex(p, nil) // Sanity: pre-fix, the search handler passed namespacedIdx for // both; Node issueCount silently zeroed. From 322d386ce5b7f857c938c8fc670d225364ef723d Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 17:38:22 +0300 Subject: [PATCH 22/33] fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two residual bugs on the AI GET resourceContext path were dropping signal silently: 1) computeIssueSummaryForResource was called with the URL-plural kind ("deployments"), which the issues composer's Filters.Kinds matcher case-folds but does NOT plural-to-singular convert. Issue.Kind is the canonical Pascal singular ("Deployment"), so every issue got filtered out and IssueSummary.Count silently collapsed to 0 (Build then omits the field entirely). Fix: pass canonicalKind (derived from obj.GVK) into the function — same pattern computeAuditSummaryForResource was already using. 2) GetRelationshipsWithObject was called with the URL-plural kind, which buildNodeID resolved to the wrong topology node for cross-group CRDs whose Kind collides with a core kind. For a Knative serving.knative.dev Service request, "services" → "service/ns/name" picked the CORE Service node instead of "knativeservice/ns/name", so relationship walks returned the core Service's edges. Same shape for CAPI Cluster ("capicluster") and Istio Gateway ("istiogateway"). Added topology.KindForGVK as a small exported helper that maps (kind, group) → pseudo-kind for the three known cross-group collisions; the handler now funnels gvk through it before the relationship lookup. Non-colliding kinds (core, apps, batch, Gateway API, etc.) 
pass through unchanged so buildNodeID's existing kindMap handles them. Tests: - pkg/topology/pseudokinds_test.go: table tests covering every remap case plus the pass-throughs (including the wrong-group-same-kind guards: Service under argoproj.io, Route under route.openshift.io). - internal/server/ai_handlers_group_test.go: * TestAIGetResource_GroupRoutesRelationshipsToKnative — seeds a Knative Service in the dynamic cache co-named with the core Service plus an Ingress backend-ref'd to the core Service. Asserts the knative-routed response does NOT leak the core Service's Ingress into resourceContext.exposes (the test fails without the KindForGVK funnel, locking in the regression). * TestAIGetResource_IssueSummaryCountsURLPluralKind — asserts a broken Deployment (UnavailableReplicas=3) surfaces with count > 0 in resourceContext.issueSummary when fetched via the URL plural. Fixture additions in TestMain: - broken/stuck-app Deployment (UnavailableReplicas=3): seeds a known problem for the issue-summary regression test, in its own namespace so it doesn't perturb default-namespace smoke tests. - default/nginx-ingress Ingress routing to Service "nginx": the distinguishing edge between core Service and Knative Service topology nodes. 
--- internal/server/ai_handlers.go | 20 ++- internal/server/ai_handlers_group_test.go | 183 ++++++++++++++++++++++ internal/server/server_smoke_test.go | 67 ++++++++ pkg/topology/pseudokinds.go | 48 ++++++ pkg/topology/pseudokinds_test.go | 61 ++++++++ 5 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 pkg/topology/pseudokinds.go create mode 100644 pkg/topology/pseudokinds_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ebab36f3..f359904cd 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -308,7 +308,7 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin canonicalKind = kind } - issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) + issueSum := computeIssueSummaryForResource(cache, canonicalKind, namespace, name) auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) opts := resourcecontext.Options{ @@ -335,8 +335,17 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin // vs core/v1 Service). idx=nil is fine for single-resource: the // per-call inline scan is O(E) once. Bulk callers (T12/T89) should // build a shared index via topology.IndexByResource(topo). + // + // Route through KindForGVK so cross-group CRDs whose Kind collides + // with a core kind (Knative {Service, Configuration, Revision, Route}, + // CAPI Cluster, Istio Gateway) resolve to the right topology node. + // The builder writes those as pseudo-kinds (e.g. "knativeservice/...") + // — without remapping, buildNodeID would resolve "services/..." to + // the core Service node and walk the wrong edges. 
+ gvk := obj.GetObjectKind().GroupVersionKind() + relKind := topology.KindForGVK(gvk.Kind, gvk.Group) opts.Relationships = topology.GetRelationshipsWithObject( - kind, namespace, name, obj, topo, prov, dyn, nil, + relKind, namespace, name, obj, topo, prov, dyn, nil, ) } @@ -379,6 +388,13 @@ func (s *Server) topologyForContext(namespace string) (*topology.Topology, topol // is done client-side; the composer's native namespace filter restricts the // scan to the resource's namespace so we don't walk the whole cluster. // +// kind MUST be the Pascal singular form the issue composer writes into +// Issue.Kind (e.g. "Deployment", "Pod") — the caller derives it from obj's +// TypeMeta. The composer's Filters.Kinds matcher case-folds both sides, but +// it does NOT plural-to-singular convert, so URL forms ("deployments", +// "pods") drop every issue ("deployments" != lower("Deployment")) and the +// summary silently collapses to nil. +// // Returns nil when no issues match — Build then omits the IssueSummary field. func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { if cache == nil { diff --git a/internal/server/ai_handlers_group_test.go b/internal/server/ai_handlers_group_test.go index da8891066..b6e0aabbe 100644 --- a/internal/server/ai_handlers_group_test.go +++ b/internal/server/ai_handlers_group_test.go @@ -124,6 +124,189 @@ func TestAIGetResource_GroupRoutesToDynamic(t *testing.T) { } } +// Group-qualified AI GET must also route the topology relationship lookup +// to the matching pseudo-kind node. The bug: handleAIGetResource passed the +// URL plural "services" straight into topology.GetRelationshipsWithObject, +// which feeds buildNodeID — and buildNodeID's kindMap resolves "services" +// to "service", landing on the CORE Service's topology node. 
For a Knative +// Service request, the response then carried the core Service's incoming +// Ingress edge as resourceContext.exposes, which is provably wrong. +// +// Fix: derive a topology-pseudo-kind via topology.KindForGVK(gvk.Kind, +// gvk.Group) — for Knative Service, that yields "knativeservice", whose +// node has no Ingress edge in this fixture and therefore no Exposes. +// +// Differentiator: the TestMain fixture seeds an Ingress backend-ref'd to +// the core Service "nginx" in "default". The Knative Service "nginx" in +// "default" (seeded below into the dynamic cache) is a separate topology +// node with NO incoming Ingress edges. The test asserts that the +// resourceContext returned for the ?group=serving.knative.dev request +// does NOT advertise that Ingress — the same fixture, when queried +// without ?group, DOES surface it (locked down by the trailing assertions +// to pin the regression's pre-fix shape and prevent a future change that +// silently drops the core-side relationship as well). 
+func TestAIGetResource_GroupRoutesRelationshipsToKnative(t *testing.T) { + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + // The Knative Service request MUST NOT inherit the core Service's + // Ingress in resourceContext.exposes. Pre-fix, the URL "services" was + // passed into buildNodeID and resolved to "service/default/nginx" — + // the wrong topology node — so the Ingress leaked. 
+ resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev") + if err != nil { + t.Fatalf("GET (knative): %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var knBody map[string]any + if err := json.NewDecoder(resp.Body).Decode(&knBody); err != nil { + t.Fatalf("decode (knative): %v", err) + } + knRC, _ := knBody["resourceContext"].(map[string]any) + if knRC == nil { + t.Fatal("knative response missing resourceContext") + } + exposes, _ := knRC["exposes"].([]any) + for _, e := range exposes { + em, _ := e.(map[string]any) + kind, _ := em["kind"].(string) + name, _ := em["name"].(string) + if kind == "Ingress" && name == "nginx-ingress" { + t.Fatalf("knative-routed request leaked the core Service's Ingress into resourceContext.exposes "+ + "(got %+v) — relationship lookup did NOT remap to the knativeservice topology node; "+ + "check that handleAIGetResource is funneling kind through topology.KindForGVK", exposes) + } + } + + // Co-anchored sibling: when no ?group is passed, the same path resolves + // to the core Service node and MUST still surface the Ingress. This + // half guards against an over-correction that nukes the relationship + // lookup for the dominant typed-cache case while fixing the CRD case. 
+ respCore, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx") + if err != nil { + t.Fatalf("GET (core): %v", err) + } + defer respCore.Body.Close() + var coreBody map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreBody); err != nil { + t.Fatalf("decode (core): %v", err) + } + coreRC, _ := coreBody["resourceContext"].(map[string]any) + coreExposes, _ := coreRC["exposes"].([]any) + foundIngress := false + for _, e := range coreExposes { + em, _ := e.(map[string]any) + if em["kind"] == "Ingress" && em["name"] == "nginx-ingress" { + foundIngress = true + break + } + } + if !foundIngress { + t.Errorf("core Service request lost the Ingress from resourceContext.exposes (got %+v) — "+ + "the fix overshot and broke the typed-cache relationship lookup", coreExposes) + } +} + +// Pin Finding 1: the AI GET handler used to pass the URL-plural kind +// ("deployments") into computeIssueSummaryForResource, which forwards +// it to issues.Compose via Filters.Kinds. The composer's applyFilters +// case-folds both sides (strings.ToLower) but does NOT plural-to-singular +// convert — and Issue.Kind is the canonical Pascal singular ("Deployment"). +// So the filter set {"deployments"} never matched lower("Deployment") = +// "deployment", every issue got dropped, and IssueSummary.Count silently +// collapsed to 0 (Build then omits the field entirely). +// +// Fix: pass canonicalKind (derived from obj.GVK) into +// computeIssueSummaryForResource so the filter is "Deployment" → matched. +// +// Fixture: TestMain seeds Deployment broken/stuck-app with +// UnavailableReplicas=3. DetectProblems emits a Pascal-singular +// "Deployment" problem for it. Hitting /api/ai/resources/deployments/... +// (URL plural) must surface the issue in resourceContext.issueSummary +// with count > 0 — pre-fix this came back as null. 
+func TestAIGetResource_IssueSummaryCountsURLPluralKind(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/deployments/broken/stuck-app") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode: %v", err) + } + rc, _ := body["resourceContext"].(map[string]any) + if rc == nil { + t.Fatal("response missing resourceContext") + } + issueSum, _ := rc["issueSummary"].(map[string]any) + if issueSum == nil { + t.Fatalf("resourceContext.issueSummary is nil — composer filter dropped every issue. "+ + "Likely the handler is still passing URL-plural kind ('deployments') into "+ + "computeIssueSummaryForResource instead of canonical Pascal singular ('Deployment'). "+ + "Got: %+v", rc) + } + count, _ := issueSum["count"].(float64) + if count < 1 { + t.Fatalf("issueSummary.count = %v, want >= 1 — DetectProblems should have flagged "+ + "the broken/stuck-app Deployment (UnavailableReplicas=3)", count) + } +} + // Happy-path sibling for the test above: when no group is passed, the // typed-cache-first path is correct (and must continue to be — the v1 // core Service is the dominant case and must not pay a dynamic-cache diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 90dcbef37..21fb3e216 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -11,7 +11,11 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" +<<<<<<< HEAD rbacv1 "k8s.io/api/rbac/v1" +======= + networkingv1 "k8s.io/api/networking/v1" +>>>>>>> b01d112 (fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup) metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -28,6 +32,7 @@ 
var ( func TestMain(m *testing.M) { replicas := int32(1) + brokenReplicas := int32(3) deployUID := "deploy-uid-1234" rsUID := "rs-uid-5678" @@ -37,6 +42,37 @@ func TestMain(m *testing.M) { ObjectMeta: metav1.ObjectMeta{Name: "default"}, Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, }, + &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "broken"}, + Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, + }, + // Broken Deployment in its own namespace so it doesn't perturb the + // "default" fixture used by every other smoke test. Used by + // TestAIGetResource_IssueSummaryCountsURLPluralKind to assert the + // composer's URL-plural-kind filter actually matches the canonical + // Pascal-singular Issue.Kind values — pre-fix, count was 0. + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stuck-app", + Namespace: "broken", + Labels: map[string]string{"app": "stuck"}, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &brokenReplicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "stuck"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "stuck"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "stuck", Image: "registry.example/stuck:1"}}}, + }, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 3, + AvailableReplicas: 0, + UnavailableReplicas: 3, + }, + }, &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "nginx", @@ -137,6 +173,37 @@ func TestMain(m *testing.M) { Ports: []corev1.ServicePort{{Port: 80, TargetPort: intstr.FromInt(80)}}, }, }, + // Ingress routing to the core Service "nginx". 
Used by + // TestAIGetResource_GroupRoutesRelationshipsToKnative to give the + // core Service a distinct incoming edge (EdgeRoutesTo) that the + // Knative Service node does NOT inherit — the test compares whether + // the AI GET handler picks up that edge under ?group=serving.knative.dev + // (regression for the kind-passed-to-relationship-lookup bug). + &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-ingress", + Namespace: "default", + }, + Spec: networkingv1.IngressSpec{ + Rules: []networkingv1.IngressRule{{ + Host: "nginx.example.com", + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{{ + Path: "/", + PathType: func() *networkingv1.PathType { p := networkingv1.PathTypePrefix; return &p }(), + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: "nginx", + Port: networkingv1.ServiceBackendPort{Number: 80}, + }, + }, + }}, + }, + }, + }}, + }, + }, // Seed Secrets in two namespaces so per-user RBAC tests can // distinguish "gate denied → []" from "no secrets in cache" and can // exercise the partial-allow case (one ns allowed, the other denied). diff --git a/pkg/topology/pseudokinds.go b/pkg/topology/pseudokinds.go new file mode 100644 index 000000000..f1f8e500f --- /dev/null +++ b/pkg/topology/pseudokinds.go @@ -0,0 +1,48 @@ +package topology + +// KindForGVK maps a (kind, group) pair to the topology-internal pseudo-kind +// the builder uses for node IDs. The topology builder synthesizes pseudo-kinds +// for a handful of CRDs whose Kind collides with a core kind under a different +// API group — these collisions would otherwise produce ambiguous node IDs. 
+// +// Callers that already hold the resource's apiVersion (i.e., obj.GVK) and want +// to look up the matching topology node MUST funnel kind through this helper, +// otherwise buildNodeID would resolve to the core node and return relationships +// for the wrong object. +// +// Today the cross-group collisions are: +// +// serving.knative.dev/Service → "knativeservice" +// serving.knative.dev/Configuration → "knativeconfiguration" +// serving.knative.dev/Revision → "knativerevision" +// serving.knative.dev/Route → "knativeroute" +// cluster.x-k8s.io/Cluster → "capicluster" +// networking.istio.io/Gateway → "istiogateway" +// +// For any other (kind, group) pair — including core kinds with group=="" and +// non-colliding CRDs — KindForGVK returns kind unchanged. buildNodeID's own +// kindMap then handles URL-plural-to-singular flattening. +func KindForGVK(kind, group string) string { + switch group { + case "serving.knative.dev": + switch kind { + case "Service": + return "knativeservice" + case "Configuration": + return "knativeconfiguration" + case "Revision": + return "knativerevision" + case "Route": + return "knativeroute" + } + case "cluster.x-k8s.io": + if kind == "Cluster" { + return "capicluster" + } + case "networking.istio.io": + if kind == "Gateway" { + return "istiogateway" + } + } + return kind +} diff --git a/pkg/topology/pseudokinds_test.go b/pkg/topology/pseudokinds_test.go new file mode 100644 index 000000000..91786564e --- /dev/null +++ b/pkg/topology/pseudokinds_test.go @@ -0,0 +1,61 @@ +package topology + +import "testing" + +// KindForGVK is the bridge between (obj.Kind, obj.Group) and the topology +// builder's pseudo-kind node-ID prefix. The builder emits pseudo-kinds for +// CRDs whose Kind collides with a core kind under a different group +// (Knative Service vs core Service, CAPI Cluster vs… nothing today but a +// future "Cluster" core kind, Istio Gateway vs Gateway API Gateway). 
+// +// A regression in this helper silently routes single-resource relationship +// lookups for those CRDs to the wrong topology node, so the table covers +// every group remapping plus the pass-through cases. +func TestKindForGVK(t *testing.T) { + tests := []struct { + name string + kind string + group string + want string + }{ + // Knative Serving collisions. + {"knative service", "Service", "serving.knative.dev", "knativeservice"}, + {"knative configuration", "Configuration", "serving.knative.dev", "knativeconfiguration"}, + {"knative revision", "Revision", "serving.knative.dev", "knativerevision"}, + {"knative route", "Route", "serving.knative.dev", "knativeroute"}, + // CAPI collision (Cluster, distinct from any future "Cluster" core kind). + {"capi cluster", "Cluster", "cluster.x-k8s.io", "capicluster"}, + // Istio Gateway collision (vs Gateway API's gateway.networking.k8s.io/Gateway). + {"istio gateway", "Gateway", "networking.istio.io", "istiogateway"}, + + // Pass-through: core kinds (group == ""). + {"core service passthrough", "Service", "", "Service"}, + {"core pod passthrough", "Pod", "", "Pod"}, + // Pass-through: apps group. + {"apps deployment passthrough", "Deployment", "apps", "Deployment"}, + {"batch job passthrough", "Job", "batch", "Job"}, + // Pass-through: Gateway API (uses the gateway.networking.k8s.io group, + // distinct from networking.istio.io — must NOT be remapped to istiogateway). + {"gateway api gateway passthrough", "Gateway", "gateway.networking.k8s.io", "Gateway"}, + // Pass-through: non-colliding CRDs. + {"argo application passthrough", "Application", "argoproj.io", "Application"}, + {"cert-manager certificate passthrough", "Certificate", "cert-manager.io", "Certificate"}, + // Pass-through: a Kind that matches a Knative collision but under the + // wrong group must NOT remap. Guards against accidental kind-only + // matching that would mis-classify e.g. core Route or future CRDs. 
+ {"route under wrong group", "Route", "route.openshift.io", "Route"}, + {"service under wrong group", "Service", "argoproj.io", "Service"}, + // Empty kind: pass-through (caller's problem to validate). + {"empty kind", "", "serving.knative.dev", ""}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := KindForGVK(tc.kind, tc.group) + if got != tc.want { + t.Errorf("KindForGVK(%q, %q) = %q, want %q", tc.kind, tc.group, got, tc.want) + } + }) + } +} From 9f7887cd40c2b0bedeeb79cf1a73deb334cdbcfb Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 12:21:31 +0300 Subject: [PATCH 23/33] chore(resourcecontext): drop Hints prose projection from v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the prose `Hints []string` field, the `EmitHints` option, the SynthesizeHints generator, and all associated tests. ~430 LoC of net removal. Rationale: our dominant agent consumer (Claude-class) composes triage prose from the structured fields trivially. The pre-baked Hints array added wire bytes + prompt tokens with no net signal for that consumer. Once shipped, agents pattern-matching on Hint substrings would have ossified the wording. The structured fields (ManagedBy, Exposes, SelectedBy, Uses, RunsOn, ScaledBy, IssueSummary, AuditSummary, PolicySummary, Omitted) carry every fact a derived prose line would encode — agents that need narrative can compose it. If a real consumer emerges that needs deterministic prose, add it as a separate `explain_resource` MCP tool. Keeping it inline was a premature bet against asymmetric costs: easy to add later, hard to remove or evolve once agents depend on the strings. The doc comment on ResourceContext records the decision so future readers don't re-introduce the projection without revisiting the tradeoff. 
--- internal/server/ai_handlers.go | 5 +- pkg/resourcecontext/build.go | 9 -- pkg/resourcecontext/build_test.go | 61 -------- pkg/resourcecontext/hints.go | 241 ------------------------------ pkg/resourcecontext/hints_test.go | 118 --------------- pkg/resourcecontext/types.go | 12 +- pkg/resourcecontext/types_test.go | 3 - 7 files changed, 9 insertions(+), 440 deletions(-) delete mode 100644 pkg/resourcecontext/hints.go delete mode 100644 pkg/resourcecontext/hints_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index f359904cd..a188a92e1 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -314,7 +314,6 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, AccessChecker: s.newRequestScopedChecker(r), - EmitHints: true, IssueSummary: issueSum, AuditSummary: auditSum, } @@ -463,8 +462,8 @@ func composeSeverityRank(s issues.Severity) int { // // TopFinding is selected deterministically: highest severity wins, with // CheckID as the ascending tiebreaker. Map iteration ordering does NOT -// influence the choice — relevant because SynthesizeHints downstream -// advertises deterministic output. +// influence the choice — agents pinning regression tests on +// resourceContext output rely on stable field values across runs. 
func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { if cache == nil || kind == "" { return nil diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index e8ef0f93a..4f294a3bb 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -59,10 +59,6 @@ type Options struct { IssueSummary *IssueSummary AuditSummary *AuditSummary PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings - - // EmitHints controls whether SynthesizeHints runs over the structured - // fields. AI-facing callers (MCP, /api/ai/*) set true; UI callers false. - EmitHints bool } // PolicyReportLookup is the minimal interface Build needs from the @@ -238,11 +234,6 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } } - // 6. Hints — AI-only. - if opts.EmitHints { - rc.Hints = SynthesizeHints(rc, opts.Tier) - } - rc.Omitted = omitted.collect() return rc } diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 3a71b81ec..d14c474c9 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -3,7 +3,6 @@ package resourcecontext import ( "context" "encoding/json" - "reflect" "testing" appsv1 "k8s.io/api/apps/v1" @@ -132,7 +131,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, IssueSummary: &IssueSummary{ Count: 1, HighestSeverity: "critical", TopReason: "ImagePullBackOff", BySource: map[string]int{"problem": 1}, @@ -203,20 +201,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount) } - // Hints: deterministic ordering covers the high-signal fields. 
- wantHints := []string{ - "Managed by Application storefront", - "1 issue (critical: ImagePullBackOff)", - "Running on node node-1", - "Exposed by 1 Service", - "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", - "Scaled by 1 HorizontalPodAutoscaler", - "Uses 2 ConfigMaps, 2 Secrets, 1 PVC, ServiceAccount web-sa", - } - if !reflect.DeepEqual(rc.Hints, wantHints) { - t.Errorf("Hints mismatch.\n got: %v\nwant: %v", rc.Hints, wantHints) - } - // Pre-computed summaries are passed through. if rc.IssueSummary == nil || rc.IssueSummary.Count != 1 { t.Errorf("IssueSummary not passed through: %+v", rc.IssueSummary) @@ -243,7 +227,6 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { rc := Build(context.Background(), dep, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, - EmitHints: true, }) if rc == nil { t.Fatal("Build returned nil") @@ -258,10 +241,6 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { if mb.Group != "helm.toolkit.fluxcd.io" { t.Errorf("ManagedBy[0].Group: got %q", mb.Group) } - wantHint := "Managed by HelmRelease web" - if len(rc.Hints) == 0 || rc.Hints[0] != wantHint { - t.Errorf("first Hint: got %v want %q", rc.Hints, wantHint) - } } func TestBuild_Service_ExposedByIngress(t *testing.T) { @@ -281,7 +260,6 @@ func TestBuild_Service_ExposedByIngress(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if got, want := len(rc.Exposes), 1; got != want { @@ -317,7 +295,6 @@ func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if rc == nil { t.Fatal("Build returned nil") @@ -354,7 +331,6 @@ func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if got, want := len(rc.ManagedBy), 1; got != want { t.Fatalf("ManagedBy len: got %d want %d", got, want) @@ -382,7 +358,6 @@ 
func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { rc := Build(context.Background(), pod, Options{ Tier: TierBasic, AccessChecker: denyChecker{group: "", kind: "Secret", namespace: "prod"}, - EmitHints: true, }) if rc.Uses != nil && len(rc.Uses.Secrets) != 0 { t.Errorf("Secrets should be empty after deny; got %+v", rc.Uses.Secrets) @@ -399,30 +374,6 @@ func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { } } -func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { - // Flux Helm labels — detected from obj metadata directly via - // topology.SynthesizeManagedBy without needing a populated Topology. - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", - Labels: map[string]string{ - "helm.toolkit.fluxcd.io/name": "web", - "helm.toolkit.fluxcd.io/namespace": "flux-system", - }}, - } - rc := Build(context.Background(), dep, Options{ - Tier: TierBasic, - AccessChecker: allowAllChecker{}, - EmitHints: false, - }) - if len(rc.Hints) != 0 { - t.Errorf("EmitHints=false but got hints: %v", rc.Hints) - } - // Structured fields still populated. 
- if len(rc.ManagedBy) != 1 { - t.Errorf("ManagedBy should still be populated: %+v", rc.ManagedBy) - } -} - func TestBuild_NilObj(t *testing.T) { if rc := Build(context.Background(), nil, Options{}); rc != nil { t.Errorf("Build(nil) = %+v, want nil", rc) @@ -457,7 +408,6 @@ func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, PolicyReports: reports, - EmitHints: true, }) if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) @@ -469,16 +419,6 @@ func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { if len(k.Top) != 0 { t.Errorf("basic tier must NOT emit Top[]; got %d entries: %+v", len(k.Top), k.Top) } - gotHint := false - for _, h := range rc.Hints { - if h == "Kyverno: 1 failing, 1 warning" { - gotHint = true - break - } - } - if !gotHint { - t.Errorf("expected Kyverno hint; got %v", rc.Hints) - } } func TestBuild_PolicyReports_DiagnosticTierIncludesTop(t *testing.T) { @@ -540,7 +480,6 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) b, err := json.MarshalIndent(rc, "", " ") if err != nil { diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go deleted file mode 100644 index a385f455f..000000000 --- a/pkg/resourcecontext/hints.go +++ /dev/null @@ -1,241 +0,0 @@ -package resourcecontext - -import ( - "fmt" - "sort" - "strings" -) - -// SynthesizeHints renders a short, deterministic prose summary of the -// structured fields in c. Returns at most maxHintsBasic lines for -// TierBasic; future tiers can expand the budget. -// -// Ordering is fixed (not data-driven) so golden tests stay stable across -// runs. No LLM is involved — every line maps to a single rule. -// -// Callers SHOULD NOT parse hints — the structured fields are the canonical -// surface. Hints exist solely as a prose convenience for AI consumers. 
-func SynthesizeHints(c *ResourceContext, tier ContextTier) []string { - if c == nil { - return nil - } - - max := maxHintsBasic - if tier == TierDiagnostic { - max = maxHintsDiagnostic - } - - out := make([]string, 0, max) - - if h := managedByHint(c.ManagedBy); h != "" { - out = append(out, h) - } - if h := issueHint(c.IssueSummary); h != "" { - out = append(out, h) - } - if h := auditHint(c.AuditSummary); h != "" { - out = append(out, h) - } - if h := runsOnHint(c.RunsOn); h != "" { - out = append(out, h) - } - if h := exposesHint(c.Exposes); h != "" { - out = append(out, h) - } - if h := selectedByHint(c.SelectedBy); h != "" { - out = append(out, h) - } - if h := scaledByHint(c.ScaledBy); h != "" { - out = append(out, h) - } - if h := usesHint(c.Uses); h != "" { - out = append(out, h) - } - if h := policyHint(c.PolicySummary); h != "" { - out = append(out, h) - } - - if len(out) > max { - out = out[:max] - } - if len(out) == 0 { - return nil - } - return out -} - -const ( - maxHintsBasic = 8 - maxHintsDiagnostic = 12 -) - -func managedByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - m := refs[0] - return fmt.Sprintf("Managed by %s %s", m.Kind, m.Name) -} - -func issueHint(s *IssueSummary) string { - if s == nil || s.Count == 0 { - return "" - } - noun := pluralize("issue", s.Count) - var b strings.Builder - fmt.Fprintf(&b, "%d %s", s.Count, noun) - if s.HighestSeverity != "" { - fmt.Fprintf(&b, " (%s", s.HighestSeverity) - if s.TopReason != "" { - fmt.Fprintf(&b, ": %s", s.TopReason) - } - b.WriteString(")") - } else if s.TopReason != "" { - fmt.Fprintf(&b, ": %s", s.TopReason) - } - return b.String() -} - -func auditHint(s *AuditSummary) string { - if s == nil || s.Count == 0 { - return "" - } - noun := pluralize("audit finding", s.Count) - if s.HighestSeverity == "" { - return fmt.Sprintf("%d %s", s.Count, noun) - } - return fmt.Sprintf("%d %s (%s)", s.Count, noun, s.HighestSeverity) -} - -func runsOnHint(r *ContextRef) string { - if 
r == nil { - return "" - } - return fmt.Sprintf("Running on node %s", r.Name) -} - -func exposesHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - return fmt.Sprintf("Exposed by %s", summarizeKindsCounts(refs)) -} - -func selectedByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - // Distinguish known SelectedBy kinds (PDB vs NetworkPolicy) in the hint — - // they read very differently to a human, and lumping them together loses - // signal. Match each kind explicitly: a future kind added to SelectedBy - // (e.g. ValidatingAdmissionPolicy) would otherwise be silently rendered - // as NetworkPolicy. Unrecognized kinds drop through to summarizeKindsCounts. - var pdb, np, other []ContextRef - for _, r := range refs { - switch r.Kind { - case "PodDisruptionBudget": - pdb = append(pdb, r) - case "NetworkPolicy": - np = append(np, r) - default: - other = append(other, r) - } - } - parts := make([]string, 0, 3) - if n := len(np); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) - } - if n := len(pdb); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) - } - if len(other) > 0 { - parts = append(parts, summarizeKindsCounts(other)) - } - return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) -} - -func selectVerb(n int) string { - if n == 1 { - return "selects this resource" - } - return "select this resource" -} - -func scaledByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - return fmt.Sprintf("Scaled by %s", summarizeKindsCounts(refs)) -} - -func usesHint(u *UsesBlock) string { - if u == nil { - return "" - } - parts := make([]string, 0, 4) - if n := len(u.ConfigMaps); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("ConfigMap", n))) - } - if n := len(u.Secrets); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("Secret", n))) - } - if n := len(u.PVCs); n > 0 { - parts = 
append(parts, fmt.Sprintf("%d PVCs", n)) - if n == 1 { - parts[len(parts)-1] = "1 PVC" - } - } - if u.ServiceAccount != nil { - parts = append(parts, fmt.Sprintf("ServiceAccount %s", u.ServiceAccount.Name)) - } - if len(parts) == 0 { - return "" - } - return "Uses " + strings.Join(parts, ", ") -} - -func policyHint(s *PolicySummary) string { - if s == nil || s.Kyverno == nil { - return "" - } - k := s.Kyverno - if k.Fail == 0 && k.Warn == 0 { - return "" - } - parts := make([]string, 0, 2) - if k.Fail > 0 { - parts = append(parts, fmt.Sprintf("%d failing", k.Fail)) - } - if k.Warn > 0 { - parts = append(parts, fmt.Sprintf("%d warning", k.Warn)) - } - return "Kyverno: " + strings.Join(parts, ", ") -} - -// summarizeKindsCounts groups refs by kind and emits "N Kind, M OtherKind" -// (deterministic order: alphabetical by kind). -func summarizeKindsCounts(refs []ContextRef) string { - counts := make(map[string]int) - for _, r := range refs { - counts[r.Kind]++ - } - kinds := make([]string, 0, len(counts)) - for k := range counts { - kinds = append(kinds, k) - } - sort.Strings(kinds) - parts := make([]string, 0, len(kinds)) - for _, k := range kinds { - parts = append(parts, fmt.Sprintf("%d %s", counts[k], pluralize(k, counts[k]))) - } - return strings.Join(parts, ", ") -} - -// pluralize returns word + "s" when n != 1. Kept English-only; resource -// kinds are loanwords (Pod, Service, etc.) so naive pluralization works. 
-func pluralize(word string, n int) string { - if n == 1 { - return word - } - return word + "s" -} diff --git a/pkg/resourcecontext/hints_test.go b/pkg/resourcecontext/hints_test.go deleted file mode 100644 index 8bc0a30d6..000000000 --- a/pkg/resourcecontext/hints_test.go +++ /dev/null @@ -1,118 +0,0 @@ -package resourcecontext - -import ( - "reflect" - "testing" -) - -func TestSynthesizeHints_NilCtx(t *testing.T) { - if got := SynthesizeHints(nil, TierBasic); got != nil { - t.Errorf("nil ctx: got %v, want nil", got) - } -} - -func TestSynthesizeHints_EmptyCtx(t *testing.T) { - rc := &ResourceContext{Tier: TierBasic} - got := SynthesizeHints(rc, TierBasic) - if got != nil { - t.Errorf("empty rc: got %v, want nil", got) - } -} - -func TestSynthesizeHints_DeterministicOrdering(t *testing.T) { - rc := &ResourceContext{ - ManagedBy: []ContextRef{{Kind: "Application", Name: "store"}}, - Exposes: []ContextRef{{Kind: "Service", Name: "api"}}, - SelectedBy: []ContextRef{ - {Kind: "NetworkPolicy", Name: "deny"}, - {Kind: "PodDisruptionBudget", Name: "pdb"}, - }, - ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "hpa"}}, - RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, - Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}}, - IssueSummary: &IssueSummary{Count: 2, HighestSeverity: "warning", TopReason: "Backoff"}, - AuditSummary: &AuditSummary{Count: 3, HighestSeverity: "danger"}, - } - want := []string{ - "Managed by Application store", - "2 issues (warning: Backoff)", - "3 audit findings (danger)", - "Running on node n1", - "Exposed by 1 Service", - "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", - "Scaled by 1 HorizontalPodAutoscaler", - "Uses 1 ConfigMap", - } - got := SynthesizeHints(rc, TierBasic) - if !reflect.DeepEqual(got, want) { - t.Errorf("hints mismatch:\n got: %v\nwant: %v", got, want) - } -} - -func TestSynthesizeHints_BasicTierCapped(t *testing.T) { - // Synthesize a maxed-out context and verify the 
basic tier caps at - // maxHintsBasic lines. This guards against unbounded hint growth. - rc := &ResourceContext{ - ManagedBy: []ContextRef{{Kind: "App", Name: "a"}}, - Exposes: []ContextRef{{Kind: "Service", Name: "svc"}}, - SelectedBy: []ContextRef{{Kind: "PodDisruptionBudget", Name: "p"}, {Kind: "NetworkPolicy", Name: "n"}}, - ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "h"}}, - RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, - Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}, Secrets: []ContextRef{{Kind: "Secret", Name: "s"}}}, - IssueSummary: &IssueSummary{Count: 1, HighestSeverity: "critical", TopReason: "Crash"}, - AuditSummary: &AuditSummary{Count: 1, HighestSeverity: "danger", TopFinding: "CKV_K8S_1"}, - PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Fail: 1, Warn: 1}}, - } - got := SynthesizeHints(rc, TierBasic) - if len(got) > maxHintsBasic { - t.Errorf("basic tier exceeded cap: got %d hints, want ≤%d (%v)", len(got), maxHintsBasic, got) - } -} - -func TestSynthesizeHints_IssueHint_NoSeverity(t *testing.T) { - rc := &ResourceContext{IssueSummary: &IssueSummary{Count: 1, TopReason: "Pending"}} - got := SynthesizeHints(rc, TierBasic) - want := []string{"1 issue: Pending"} - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestSynthesizeHints_PolicyHint_OnlyPass_Skipped(t *testing.T) { - rc := &ResourceContext{PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Pass: 3}}} - got := SynthesizeHints(rc, TierBasic) - if got != nil { - t.Errorf("only-pass summary should not emit a hint; got %v", got) - } -} - -func TestUsesHint_PVCSingular(t *testing.T) { - rc := &ResourceContext{Uses: &UsesBlock{PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data"}}}} - got := SynthesizeHints(rc, TierBasic) - want := []string{"Uses 1 PVC"} - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestSelectVerb(t *testing.T) { - if 
selectVerb(1) != "selects this resource" { - t.Errorf("verb(1): %q", selectVerb(1)) - } - if selectVerb(2) != "select this resource" { - t.Errorf("verb(2): %q", selectVerb(2)) - } -} - -func TestSummarizeKindsCounts_AlphabeticalOrder(t *testing.T) { - refs := []ContextRef{ - {Kind: "Service", Name: "a"}, - {Kind: "Ingress", Name: "b"}, - {Kind: "Service", Name: "c"}, - } - got := summarizeKindsCounts(refs) - want := "1 Ingress, 2 Services" - if got != want { - t.Errorf("got %q want %q", got, want) - } -} diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 15f5cfee7..6e98d0bec 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -20,10 +20,13 @@ package resourcecontext // response. Every field is optional; the zero value is a valid (empty) // "basic"-tier context. // -// Hints is an optional, presentation-only field — populated by AI-facing -// callers (MCP, /api/ai/*) and omitted by UI-facing callers. The structured -// fields above are the canonical facts; hints are a derived prose -// projection. +// All fields are structured. A prose `Hints []string` projection was +// considered (and prototyped) but cut from v1: our dominant agent consumer +// composes triage prose from the structured fields itself, the additional +// wire bytes earned no net signal, and once shipped, agents pattern-matching +// on hint substrings would have ossified the wording. If a real consumer +// emerges that needs deterministic prose, add it as a separate +// `explain_resource` tool rather than re-introducing it inline here. 
type ResourceContext struct { Tier ContextTier `json:"tier"` ManagedBy []ContextRef `json:"managedBy,omitempty"` @@ -35,7 +38,6 @@ type ResourceContext struct { IssueSummary *IssueSummary `json:"issueSummary,omitempty"` AuditSummary *AuditSummary `json:"auditSummary,omitempty"` PolicySummary *PolicySummary `json:"policySummary,omitempty"` - Hints []string `json:"hints,omitempty"` Omitted []OmittedField `json:"omitted,omitempty"` } diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index 2e891594c..f8174fe3e 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -105,7 +105,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { IssueSummary: &IssueSummary{Count: 1}, AuditSummary: &AuditSummary{Count: 2}, PolicySummary: &PolicySummary{}, - Hints: []string{"hint"}, Omitted: []OmittedField{{Field: "selectedBy", Reason: OmittedRBACDenied}}, } b, err := json.Marshal(ac) @@ -124,7 +123,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { `"issueSummary"`, `"auditSummary"`, `"policySummary"`, - `"hints"`, `"omitted"`, } prev := -1 @@ -197,7 +195,6 @@ func TestResourceContextRoundTrip(t *testing.T) { }}, }, }, - Hints: []string{"Managed by Deployment api"}, Omitted: []OmittedField{ {Field: "selectedBy.networkPolicies", Reason: OmittedRBACDenied}, {Field: "policySummary.kyverno", Reason: OmittedNotInstalled}, From 1b56e9ba11dcd0ba44cb6ff340b602d8c45ef0b9 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 13:27:45 +0300 Subject: [PATCH 24/33] docs(ai): declare /api/ai/* outside the OpenAPI spec, with reasoning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Radar is OpenAPI-first per CLAUDE.md, but that discipline was adopted to keep the Go backend and the TypeScript SPA client in sync — one spec, regenerated as server stubs + TS client. 
/api/ai/* has no SPA consumer; agents read MCP tool descriptions or in-prompt instructions, not OpenAPI specs. Without this doc, a future maintainer might reflexively bring /api/ai/* under openapi.yaml on the assumption that radar's OpenAPI-first stance covers every endpoint. That would pay the spec-authoring tax during agent-surface evolution without earning the SDK-generation benefit. The wave-2 wire shapes already evolved across three review rounds; locking them down in YAML during that churn would have produced spec/code drift, not contract clarity. Top-of-file doc records the intentional opt-out + the revisit triggers (surface stability + public-SDK commitment). Future readers see explicit reasoning instead of accidental omission. --- internal/server/ai_handlers.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index a188a92e1..1d7cf8a19 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -1,3 +1,30 @@ +// /api/ai/* is the REST mirror of the MCP agent surface. Both target AI +// consumers (Claude, scripted agents) rather than the SPA, and both +// intentionally evolve at agent-iteration speed. +// +// Unlike /api/* (consumed by the SPA via a generated TypeScript client), +// the /api/ai/* surface is NOT specified in openapi.yaml. The original +// motivation for OpenAPI-first in radar was frontend/backend type safety — +// one spec, regenerated as Go server stubs + TS client. That value +// proposition does not apply here: the agent consumer doesn't read +// OpenAPI specs (it reads MCP tool descriptions or in-prompt instructions), +// and the SPA doesn't call these endpoints at all. +// +// Wire shapes for the agent surface live in pkg/resourcecontext (typed +// JSON DTOs) and pkg/topology. MCP tools document their wire via +// jsonschema struct tags. 
/api/ai/* follows the same code-defined +// discipline as MCP, treating them as one logical surface served over +// two protocols. +// +// Revisit this opt-out when: +// (a) the agent surface stabilizes (no major shape changes for two +// release cycles), AND +// (b) Skyhook commits to a public customer-facing AI SDK that needs +// generated bindings. +// +// Until both conditions are met, bringing /api/ai/* under openapi.yaml +// is premature — it would pay the spec-authoring tax during evolution +// without earning the SDK-generation benefit. package server import ( From bb3fe6d1d8eed7af4d0735e00c9a841a4b28e056 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 14:44:50 +0300 Subject: [PATCH 25/33] fix(resourcecontext): pseudo-kind in fallback, swap CM/Secret reasons, 500 on unknown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Bugbot findings on PR #721: 1. Build's relationship fallback used `ident.Kind` directly (the raw GVK kind like "Service") when opts.Relationships was nil but opts.Topology was set. For Knative serving.knative.dev/Service this resolved to the CORE Service topology node, leaking core relationships into the CRD's resourceContext — defeating the KindForGVK fix that the handler-side pre-computation already applies. Mirror the handler's resolution here so the fallback agrees with the primary path. 2. buildUsesFromPod had the dominant-pattern Reason labels reversed: ConfigMaps got ReasonEnvVarRef, Secrets got ReasonVolumeMount. ConfigMaps surface primarily via volume mounts (config files); Secrets primarily via env (SecretKeyRef). Swapped to match the common case. Doc comment clarifies the label is best-effort per kind — both paths feed both kinds in practice. 3. writeAIFetchError default branch returned 404 NOT_FOUND for any unrecognized error. Unknown errors are server-side (e.g. "resource discovery not initialized") and should surface as 500, not be masked as missing-resource. 
One-line fix. No test changes — the fixes are correctness-only and existing tests remain valid. --- internal/server/ai_handlers.go | 5 ++++- pkg/resourcecontext/build.go | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 1d7cf8a19..cdca066ca 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -300,7 +300,10 @@ func (s *Server) writeAIFetchError(w http.ResponseWriter, kind string, err error case strings.Contains(msg, "not found"): s.writeError(w, http.StatusNotFound, msg) default: - s.writeError(w, http.StatusNotFound, msg) + // Unknown errors are server-side problems (e.g. "resource discovery + // not initialized", "dynamic resource cache not initialized") — surface + // as 500 so debugging upstream issues isn't masked by a misleading 404. + s.writeError(w, http.StatusInternalServerError, msg) } } diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 4f294a3bb..4ea5e3cdc 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -113,8 +113,15 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // over relationships per row pass them in directly. rel := opts.Relationships if rel == nil && opts.Topology != nil { + // Resolve the topology-pseudo-kind so cross-group CRDs (Knative + // serving.knative.dev/Service, CAPI cluster.x-k8s.io/Cluster, …) + // look up the right node. Using ident.Kind directly would lower- + // case to "service" and resolve to the core Service node, leaking + // the wrong resource's relationships into the CRD's resourceContext. + // The handler-side pre-computation does this same KindForGVK + // resolution; mirror it here so the fallback path doesn't undo it. 
rel = topology.GetRelationshipsWithObject( - ident.Kind, ident.Namespace, ident.Name, obj, + topology.KindForGVK(ident.Kind, ident.Group), ident.Namespace, ident.Name, obj, opts.Topology, opts.Provider, opts.DynamicProv, opts.RelIndex, ) } @@ -357,9 +364,16 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) + // ConfigMaps are most often surfaced via volume mounts (volume.ConfigMap) + // and Secrets via env (SecretKeyRef on container.env). Both kinds appear + // via both paths in practice, so the single Reason label per kind is a + // best-effort discriminator: it answers "what's the dominant lookup + // pattern for THIS kind?" — not "how was THIS particular ref discovered?" + // Reversing them (the prior labelling had them swapped) made the + // metadata actively misleading. uses := &UsesBlock{ - ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonEnvVarRef, SourceK8sSpec), "uses.configMaps", omitted), - Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonVolumeMount, SourceK8sSpec), "uses.secrets", omitted), + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonVolumeMount, SourceK8sSpec), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonEnvVarRef, SourceK8sSpec), "uses.secrets", omitted), PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), } From 444195a68da5a278fd01a9643215913a708c1879 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 17:53:48 +0300 Subject: [PATCH 26/33] chore(resourcecontext): drop speculative wire surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove fields and enum values with no consumer in v1 and no concrete plan to populate them in T7/T10/T13: - 
ContextRef.Reason (RefReason enum, 10 values): structurally redundant with the parent field name (selectedBy → selector match, runsOn → node binding, etc.). The ConfigMap/Secret refs were tagged with a single Reason that was inaccurate for the env-from vs volume-mount half of each set — actively misleading on the wire. - ContextRef.Source (RefSource enum, 5 values): internal provenance describing which radar subsystem produced the fact. No agent or UI consumer branches on it; planned policy_report/audit_engine sources never get a ContextRef anyway (T10 emits PolicySummary/AuditSummary rollups). - ContextRef.Confidence: reserved field, never set. Defer until fuzzy joins (Trivy/ConfigAudit) actually land. - Options.MaxTokens: reserved field, never enforced. Add when there's budgeting code. - ResourceContext.Truncated: never set by Build — no truncation path exists in the generator (RBAC drops are reported via Omitted). - OmittedKindUnsupported + OmittedProviderDisabled: unused enum values. Kept rbac_denied / budget_exceeded / cache_cold / not_installed — all four are populated today or planned for T10's policy-report diagnostic per the TODO in internal/k8s/policy_reports.go. --- pkg/resourcecontext/build.go | 50 ++++++++++--------------------- pkg/resourcecontext/build_test.go | 3 -- 2 files changed, 16 insertions(+), 37 deletions(-) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 4ea5e3cdc..46d2576a9 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -25,8 +25,7 @@ import ( // in internal/* pre-compute IssueSummary / AuditSummary / PolicyReports and // pass them in, so we don't reach into internal/issues or internal/audit. type Options struct { - Tier ContextTier - MaxTokens int // reserved for future budgeting; not enforced in v1 + Tier ContextTier // AccessChecker gates every emitted ContextRef. nil = no gating (treat // as fully authorized — local-kubeconfig / tests). 
@@ -141,7 +140,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } if len(managedBy) > 0 { rc.ManagedBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(managedBy, ReasonOwnerReference, SourceOwnerChain), + toContextRefs(managedBy), "managedBy", omitted) } @@ -153,18 +152,18 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte exposes = append(exposes, rel.Gateways...) exposes = append(exposes, rel.Routes...) rc.Exposes = filterRefs(ctx, opts.AccessChecker, - toContextRefs(exposes, ReasonLabelSelector, SourceTopology), + toContextRefs(exposes), "exposes", omitted) selected := make([]topology.ResourceRef, 0, len(rel.PDBs)+len(rel.NetworkPolicies)) selected = append(selected, rel.PDBs...) selected = append(selected, rel.NetworkPolicies...) rc.SelectedBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(selected, ReasonPodSelector, SourceTopology), + toContextRefs(selected), "selectedBy", omitted) rc.ScaledBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(rel.Scalers, ReasonScaleTargetRef, SourceTopology), + toContextRefs(rel.Scalers), "scaledBy", omitted) } @@ -187,8 +186,6 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte Group: rel.ServiceAccount.Group, Namespace: rel.ServiceAccount.Namespace, Name: rel.ServiceAccount.Name, - Reason: ReasonSAName, - Source: SourceK8sSpec, } if checkRef(ctx, opts.AccessChecker, candidate) { rc.Uses.ServiceAccount = candidate @@ -213,11 +210,9 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } if nodeName != "" { candidate := &ContextRef{ - Kind: "Node", - Group: nodeGroup, - Name: nodeName, - Reason: ReasonNodeName, - Source: SourceK8sSpec, + Kind: "Node", + Group: nodeGroup, + Name: nodeName, } if checkRef(ctx, opts.AccessChecker, candidate) { rc.RunsOn = candidate @@ -364,17 +359,10 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, 
scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) - // ConfigMaps are most often surfaced via volume mounts (volume.ConfigMap) - // and Secrets via env (SecretKeyRef on container.env). Both kinds appear - // via both paths in practice, so the single Reason label per kind is a - // best-effort discriminator: it answers "what's the dominant lookup - // pattern for THIS kind?" — not "how was THIS particular ref discovered?" - // Reversing them (the prior labelling had them swapped) made the - // metadata actively misleading. uses := &UsesBlock{ - ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonVolumeMount, SourceK8sSpec), "uses.configMaps", omitted), - Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonEnvVarRef, SourceK8sSpec), "uses.secrets", omitted), - PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", ""), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", ""), "uses.secrets", omitted), + PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", ""), "uses.pvcs", omitted), } if sa := pod.Spec.ServiceAccountName; sa != "" { @@ -382,8 +370,6 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, Kind: "ServiceAccount", Namespace: pod.Namespace, Name: sa, - Reason: ReasonSAName, - Source: SourceK8sSpec, } if checkRef(ctx, ac, candidate) { uses.ServiceAccount = candidate @@ -478,7 +464,7 @@ func (s *refSet) add(name, ns string) { // refs returns the accumulated set as ContextRefs sorted by (namespace, name) // for deterministic golden output. 
-func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) []ContextRef { +func (s *refSet) refs(kind, group string) []ContextRef { if len(s.order) == 0 { return nil } @@ -496,8 +482,6 @@ func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) [] Group: group, Namespace: e.Namespace, Name: e.Name, - Reason: reason, - Source: source, } } return out @@ -507,10 +491,10 @@ func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) [] // Topology ref → ContextRef // --------------------------------------------------------------------------- -// toContextRefs translates a slice of topology.ResourceRef into ContextRefs -// with the given reason+source. Sorted by (kind, namespace, name) for -// determinism — golden tests rely on this ordering. -func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSource) []ContextRef { +// toContextRefs translates a slice of topology.ResourceRef into ContextRefs. +// Sorted by (kind, namespace, name) for determinism — golden tests rely on +// this ordering. 
+func toContextRefs(refs []topology.ResourceRef) []ContextRef { if len(refs) == 0 { return nil } @@ -521,8 +505,6 @@ func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSour Group: r.Group, Namespace: r.Namespace, Name: r.Name, - Reason: reason, - Source: source, }) } sort.SliceStable(out, func(i, j int) bool { diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index d14c474c9..9e14da46a 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -150,9 +150,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { if mb.Kind != "Application" || mb.Name != "storefront" || mb.Namespace != "argocd" { t.Errorf("ManagedBy[0]: got %+v, want Application argocd/storefront", mb) } - if mb.Source != SourceOwnerChain { - t.Errorf("ManagedBy[0].Source: got %q want %q", mb.Source, SourceOwnerChain) - } // Exposes: the Service routes to the pod. if got, want := len(rc.Exposes), 1; got != want { From 3e43759b32ba6bc37d97bd236d5709c46153d445 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 18:13:51 +0300 Subject: [PATCH 27/33] fix(resourcecontext): address Bugbot findings on T6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop the handler-side pre-compute of Relationships in buildAIResourceContext. Build's own fallback already does the identical KindForGVK pseudo-kind resolution; pre-computing here doubled the work whenever the lookup returned nil (handler call returned nil → opts.Relationships=nil → Build's `rel==nil && opts.Topology!=nil` branch re-ran the same scan). - Sort issue-summary rows by (severity desc, Reason asc) before picking topReason. The sibling computeAuditSummaryForResource already sorts (severity desc, CheckID asc) for determinism; the issue path's iteration-order pick could vary across runs when multiple rows tie on severity. 
--- internal/server/ai_handlers.go | 48 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index cdca066ca..789df5c7f 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -359,23 +359,12 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts.Topology = topo opts.Provider = prov opts.DynamicProv = dyn - // Pre-compute Relationships once with the already-fetched obj so - // kind/group disambiguation works (Knative serving.knative.dev/Service - // vs core/v1 Service). idx=nil is fine for single-resource: the - // per-call inline scan is O(E) once. Bulk callers (T12/T89) should - // build a shared index via topology.IndexByResource(topo). - // - // Route through KindForGVK so cross-group CRDs whose Kind collides - // with a core kind (Knative {Service, Configuration, Revision, Route}, - // CAPI Cluster, Istio Gateway) resolve to the right topology node. - // The builder writes those as pseudo-kinds (e.g. "knativeservice/...") - // — without remapping, buildNodeID would resolve "services/..." to - // the core Service node and walk the wrong edges. - gvk := obj.GetObjectKind().GroupVersionKind() - relKind := topology.KindForGVK(gvk.Kind, gvk.Group) - opts.Relationships = topology.GetRelationshipsWithObject( - relKind, namespace, name, obj, topo, prov, dyn, nil, - ) + // Relationships are computed inside Build via GetRelationshipsWithObject, + // which applies the same KindForGVK pseudo-kind remap we used to do + // here. Pre-computing in the handler doubled the work whenever the + // lookup returned nil (no edges): handler call returned nil, Build's + // `rel == nil && opts.Topology != nil` fallback re-ran the identical + // scan. Leaving opts.Relationships unset is the canonical path. 
} return resourcecontext.Build(r.Context(), obj, opts) @@ -442,9 +431,7 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n } rows, _ := issues.ComposeWithStats(provider, filters) - var count int - var topReason string - var topSeverity issues.Severity + matched := make([]issues.Issue, 0, len(rows)) bySource := make(map[string]int) for _, row := range rows { if row.Name != name { @@ -453,16 +440,25 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if namespace != "" && row.Namespace != namespace { continue } - count++ + matched = append(matched, row) bySource[string(row.Source)]++ - if topSeverity == "" || composeSeverityRank(row.Severity) > composeSeverityRank(topSeverity) { - topSeverity = row.Severity - topReason = row.Reason - } } - if count == 0 { + if len(matched) == 0 { return nil } + // Sort by (severity desc, Reason asc) so TopReason is deterministic + // across runs even when multiple rows tie on severity. Mirrors the + // stable sort applied in computeAuditSummaryForResource. + sort.Slice(matched, func(i, j int) bool { + ri, rj := composeSeverityRank(matched[i].Severity), composeSeverityRank(matched[j].Severity) + if ri != rj { + return ri > rj + } + return matched[i].Reason < matched[j].Reason + }) + count := len(matched) + topSeverity := matched[0].Severity + topReason := matched[0].Reason return &resourcecontext.IssueSummary{ Count: count, HighestSeverity: string(topSeverity), From 37983a1bf48e7ca1f036ea909124400ba337f09c Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 21:45:32 +0300 Subject: [PATCH 28/33] fix(audit): match issue summary's nil-namespace guard for cluster-scoped lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit computeAuditSummaryForResource unconditionally passed []string{namespace} to audit.RunFromCache, even when namespace=="" (cluster-scoped resources). 
That filters audit to literally namespace="" resources instead of scanning all namespaces. computeIssueSummaryForResource already guards with `if namespace != ""` — match it. Latent today since the audit suite doesn't currently cover cluster-scoped kinds, but the inconsistency would silently miss findings the moment a cluster-scoped check lands. --- internal/server/ai_handlers.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 789df5c7f..4bbca9cde 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -494,7 +494,16 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if cache == nil || kind == "" { return nil } - results := audit.RunFromCache(cache, []string{namespace}, nil) + // Match computeIssueSummaryForResource's guard: passing []string{""} to + // RunFromCache would filter to literally namespace="" resources instead + // of scanning all namespaces. Latent today since the audit suite + // doesn't cover cluster-scoped kinds, but the inconsistency would + // silently miss findings the moment a cluster-scoped check lands. 
+ var namespaces []string + if namespace != "" { + namespaces = []string{namespace} + } + results := audit.RunFromCache(cache, namespaces, nil) if results == nil || len(results.Findings) == 0 { return nil } From 3742ffa1bfd4d1dacbcc3dd55e386d18deb8f994 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 22:45:11 +0300 Subject: [PATCH 29/33] =?UTF-8?q?fix:=20rebase=20fallout=20=E2=80=94=20gro?= =?UTF-8?q?up-aware=20FindingsFor=20+=20smoke-test=20import=20merge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two T6 fixes required after rebasing onto post-#723 (Kyverno) and post-#726 (RBAC reverse-lookup + Crossplane) main: - T11 (#723) threaded API group through the PolicyReport lookup, changing policyreports.Index.FindingsFor from (kind, ns, name) to (group, kind, ns, name). Updated resourcecontext.PolicyReportLookup interface, the build.go call site (passing ident.Group), the policyReportLookupAdapter in ai_handlers.go, and the test mock. Group threading is a strict improvement — two CRDs sharing kind+ns+name across API groups now get disjoint findings instead of one inheriting the other's. - internal/server/server_smoke_test.go acquired a `rbacv1` import on main (#726) at the same line our `networkingv1` import lands on this branch. Conflict resolution: keep both, sorted. 
--- internal/server/ai_handlers.go | 4 ++-- internal/server/server_smoke_test.go | 5 +---- pkg/resourcecontext/build.go | 4 ++-- pkg/resourcecontext/build_test.go | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 4bbca9cde..cd4a0f454 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -59,11 +59,11 @@ type policyReportLookupAdapter struct { idx *policyreports.Index } -func (a policyReportLookupAdapter) FindingsFor(kind, namespace, name string) []resourcecontext.KyvernoFinding { +func (a policyReportLookupAdapter) FindingsFor(group, kind, namespace, name string) []resourcecontext.KyvernoFinding { if a.idx == nil { return nil } - findings := a.idx.FindingsFor(kind, namespace, name) + findings := a.idx.FindingsFor(group, kind, namespace, name) if len(findings) == 0 { return nil } diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 21fb3e216..268283396 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -11,11 +11,8 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" -<<<<<<< HEAD - rbacv1 "k8s.io/api/rbac/v1" -======= networkingv1 "k8s.io/api/networking/v1" ->>>>>>> b01d112 (fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup) + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 46d2576a9..49170abf9 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -66,7 +66,7 @@ type Options struct { // Build does not import pkg/policyreports directly because callers may // adapt other policy engines into the same shape. 
type PolicyReportLookup interface { - FindingsFor(kind, namespace, name string) []KyvernoFinding + FindingsFor(group, kind, namespace, name string) []KyvernoFinding } // RefAccessChecker abstracts the RBAC check so this package doesn't import @@ -230,7 +230,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // counts only (fail/warn/pass); diagnostic tier adds the top[] // findings. Tier discrimination keeps the basic-tier wire size tight. if opts.PolicyReports != nil { - findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) + findings := opts.PolicyReports.FindingsFor(ident.Group, ident.Kind, ident.Namespace, ident.Name) if len(findings) > 0 { rc.PolicySummary = buildPolicySummary(findings, opts.Tier) } diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 9e14da46a..c1fe2a5c0 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -41,7 +41,7 @@ func (d denyChecker) CanRead(_ context.Context, group, kind, namespace string) b // mockPolicyReports implements PolicyReportLookup. type mockPolicyReports map[string][]KyvernoFinding -func (m mockPolicyReports) FindingsFor(kind, namespace, name string) []KyvernoFinding { +func (m mockPolicyReports) FindingsFor(group, kind, namespace, name string) []KyvernoFinding { return m[kind+"/"+namespace+"/"+name] } From feb0073b485dbdfb93032fa52e9d3c7c27f79d3d Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 22:45:25 +0300 Subject: [PATCH 30/33] test: extend fakeIssuesProvider with Kyverno methods for post-T11 Provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit T11 (#723) added KyvernoFindings + KyvernoStatus to the issues.Provider interface so the composer can route PolicyReport findings into the unified issue stream and surface index-lifecycle status. 
Our test fake didn't implement either, so summarycontext tests stopped compiling after the rebase onto post-T11 main. Returning nil/"" is correct for these tests: BuildIssueIndex doesn't read Kyverno findings (kindFilter was dropped in the prior commit; the index buckets problem+condition only), and KyvernoStatus is consumed by the issues meta block — not by the index path under test. --- internal/summarycontext/summarycontext_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/summarycontext/summarycontext_test.go b/internal/summarycontext/summarycontext_test.go index e1b68769d..f032e1f52 100644 --- a/internal/summarycontext/summarycontext_test.go +++ b/internal/summarycontext/summarycontext_test.go @@ -18,6 +18,7 @@ import ( "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" bp "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" "github.com/skyhook-io/radar/pkg/resourcecontext" "github.com/skyhook-io/radar/pkg/topology" ) @@ -63,6 +64,8 @@ func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string return nil, nil } func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } +func (f *fakeIssuesProvider) KyvernoFindings() []policyreports.SubjectFindings { return nil } +func (f *fakeIssuesProvider) KyvernoStatus() string { return "" } func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } From 9d6658a125718d904314574bf7dd285b11fe1e6f Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 23:55:29 +0300 Subject: [PATCH 31/33] fix: group-aware audit + issue summary lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-group kind collisions (e.g. core/Service vs serving.knative.dev/Service sharing namespace + name) previously caused audit/issue summaries to silently inherit findings from the wrong resource. 
The fetch and relationship paths in T6 were already group-aware; the summary lookups were not. pkg/audit: - Finding and ResourceGroup gain a Group field (omitempty on the wire). - ResourceKey signature changes to (group, kind, ns, name) — encoded "group|Kind|ns|name" with "|" delimiter matching the issue-source key in internal/summarycontext. - Exported GroupForBuiltinKind(kind) — single source of truth for the built-in (Kind→Group) map. buildResults populates Finding.Group via this helper so the 20 per-check emission sites stay terse. internal/server: - buildAIResourceContext threads canonicalGroup from obj GVK through to both computeIssueSummaryForResource and computeAuditSummaryForResource. - Audit lookup uses the new group-aware ResourceKey. - Issue summary adds a strict row.Group == group check inside the match loop. The previous kind-only filter could silently pull in a colliding core kind's issues for a CRD lookup. - UI audit drill-down (handleAuditResource) explicitly passes group="" since the URL doesn't carry group today — comment points to #35 for the CRD-aware drill-down work. internal/issues: - Centralised Group resolution via resolveGroup(group, kind): use the explicit group if set, else fall back to audit.GroupForBuiltinKind. - Applied at fromProblem (legacy k8s.DetectProblems sites still emit Group="" for built-in workloads), fromAudit (passes through audit.Finding.Group which buildResults populates), fromWarningEvent (split apiVersion → group, with "v1"-shape → core). Pin tests: - TestResourceKey_GroupAware: distinct keys across groups. - TestIndexByResource_NoCrossGroupCollision: lookup per-group returns only its own findings. - TestGroupForBuiltinKind: the table. Closes #35. 
--- internal/issues/issues.go | 27 +++++++++++- internal/server/ai_handlers.go | 23 +++++++--- internal/server/audit_handlers.go | 10 +++-- pkg/audit/checks.go | 11 ++++- pkg/audit/helpers.go | 56 ++++++++++++++++++++---- pkg/audit/helpers_test.go | 71 +++++++++++++++++++++++++++++++ pkg/audit/types.go | 10 +++++ 7 files changed, 188 insertions(+), 20 deletions(-) create mode 100644 pkg/audit/helpers_test.go diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 2ed01b2bb..4a3764dae 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -303,6 +303,21 @@ func condTypeReason(condType, reason string) string { // Source-specific normalization // --------------------------------------------------------------------------- +// resolveGroup returns the explicit group if set, else falls back to the +// built-in (Kind→Group) table. Some legacy Problem emission sites in +// k8s.DetectProblems still leave Group="" for built-in workloads +// (Deployment, StatefulSet, etc.) — without this fallback, the +// group-aware consumer (computeIssueSummaryForResource) would silently +// drop those rows when looking up by canonical group like "apps". +// Centralised here so the (Kind→Group) map lives in one place across +// packages (pkg/audit owns the table; this is a pass-through). 
+func resolveGroup(group, kind string) string { + if group != "" { + return group + } + return bp.GroupForBuiltinKind(kind) +} + func fromProblem(p k8s.Problem, now time.Time) Issue { sev := SeverityWarning if p.Severity == "critical" { @@ -313,7 +328,7 @@ func fromProblem(p k8s.Problem, now time.Time) Issue { Severity: sev, Source: SourceProblem, Kind: p.Kind, - Group: p.Group, + Group: resolveGroup(p.Group, p.Kind), Namespace: p.Namespace, Name: p.Name, Reason: p.Reason, @@ -333,6 +348,7 @@ func fromAudit(fin bp.Finding, now time.Time) Issue { Severity: sev, Source: SourceAudit, Kind: fin.Kind, + Group: resolveGroup(fin.Group, fin.Kind), Namespace: fin.Namespace, Name: fin.Name, Reason: fin.CheckID, @@ -413,10 +429,19 @@ func fromWarningEvent(e *corev1.Event) Issue { if first.IsZero() { first = last } + // Event.InvolvedObject carries apiVersion (group/version); split out + // the group so cross-group consumers don't collide when a Knative + // Service and a core Service share name+ns. + group, _, _ := strings.Cut(e.InvolvedObject.APIVersion, "/") + if e.InvolvedObject.APIVersion != "" && !strings.Contains(e.InvolvedObject.APIVersion, "/") { + // "v1" → core group "". + group = "" + } return Issue{ Severity: SeverityWarning, Source: SourceEvent, Kind: e.InvolvedObject.Kind, + Group: resolveGroup(group, e.InvolvedObject.Kind), Namespace: e.Namespace, Name: e.InvolvedObject.Name, Reason: e.Reason, diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index cd4a0f454..2c3d74fa0 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -333,13 +333,15 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin // so the audit index lookup keys correctly. Falls back to the URL kind // only when TypeMeta is somehow empty; non-canonical input there would // silently mis-key the audit lookup. 
- canonicalKind := obj.GetObjectKind().GroupVersionKind().Kind + gvk := obj.GetObjectKind().GroupVersionKind() + canonicalKind := gvk.Kind if canonicalKind == "" { canonicalKind = kind } + canonicalGroup := gvk.Group - issueSum := computeIssueSummaryForResource(cache, canonicalKind, namespace, name) - auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) + issueSum := computeIssueSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) + auditSum := computeAuditSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, @@ -414,7 +416,7 @@ func (s *Server) topologyForContext(namespace string) (*topology.Topology, topol // summary silently collapses to nil. // // Returns nil when no issues match — Build then omits the IssueSummary field. -func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { +func computeIssueSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.IssueSummary { if cache == nil { return nil } @@ -440,6 +442,15 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if namespace != "" && row.Namespace != namespace { continue } + // Group-aware match: T11 populates Issue.Group for problem + + // condition sources, so a Knative serving.knative.dev/Service + // lookup won't pull in the core Service's issues (or vice + // versa). Audit and warning-event rows resolve Group through + // resolveGroup in internal/issues, so a built-in kind matches + // its canonical group (e.g. "apps" for a Deployment). + if row.Group != group { + continue + } matched = append(matched, row) bySource[string(row.Source)]++ } @@ -490,7 +501,7 @@ func composeSeverityRank(s issues.Severity) int { // CheckID as the ascending tiebreaker. 
Map iteration ordering does NOT // influence the choice — agents pinning regression tests on // resourceContext output rely on stable field values across runs. -func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { +func computeAuditSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.AuditSummary { if cache == nil || kind == "" { return nil } @@ -508,7 +519,7 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n return nil } idx := bpaudit.IndexByResource(results.Findings) - match := idx[bpaudit.ResourceKey(kind, namespace, name)] + match := idx[bpaudit.ResourceKey(group, kind, namespace, name)] if len(match) == 0 { return nil } diff --git a/internal/server/audit_handlers.go b/internal/server/audit_handlers.go index edf22d2d8..5465e638d 100644 --- a/internal/server/audit_handlers.go +++ b/internal/server/audit_handlers.go @@ -130,12 +130,16 @@ func (s *Server) handleAuditResource(w http.ResponseWriter, r *http.Request) { results = applyAuditSettings(results, getAuditConfig()) index := bp.IndexByResource(results.Findings) - // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment") - findings := index[bp.ResourceKey(kind, namespace, name)] + // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment"). + // This handler is the UI's per-resource audit drill-down — group isn't on + // the URL today (the UI doesn't list grouped CRDs here yet), so we look + // up with group="" which matches the built-ins the audit suite scans. + // When CRD audit lands (#35 follow-up), thread group through the URL. 
+ findings := index[bp.ResourceKey("", kind, namespace, name)] if findings == nil { goKind := apiResourceToKind(kind) if goKind != kind { - findings = index[bp.ResourceKey(goKind, namespace, name)] + findings = index[bp.ResourceKey("", goKind, namespace, name)] } } if findings == nil { diff --git a/pkg/audit/checks.go b/pkg/audit/checks.go index 1e19dacb9..c61120190 100644 --- a/pkg/audit/checks.go +++ b/pkg/audit/checks.go @@ -935,6 +935,15 @@ func buildResults(findings []Finding) *ScanResults { categories[cat] = CategorySummary{} } + // Populate Group from the built-in (Kind→Group) table. Check emission + // sites leave Group="" so the per-check code stays terse — single + // point of truth here instead of every Finding{} literal. + for i := range findings { + if findings[i].Group == "" { + findings[i].Group = GroupForBuiltinKind(findings[i].Kind) + } + } + // Merge findings: same (resource, checkID) get combined into one finding // with messages joined, so multi-container workloads show all affected containers. type checkKey struct{ resource, checkID string } @@ -942,7 +951,7 @@ func buildResults(findings []Finding) *ScanResults { var dedupFindings []Finding for _, f := range findings { - key := checkKey{ResourceKey(f.Kind, f.Namespace, f.Name), f.CheckID} + key := checkKey{ResourceKey(f.Group, f.Kind, f.Namespace, f.Name), f.CheckID} if idx, exists := mergeIndex[key]; exists { dedupFindings[idx].Message += "; " + f.Message continue diff --git a/pkg/audit/helpers.go b/pkg/audit/helpers.go index aee6ebea1..681a5eaa2 100644 --- a/pkg/audit/helpers.go +++ b/pkg/audit/helpers.go @@ -6,19 +6,23 @@ import ( "strings" ) -// ResourceKey returns the index key for a resource: "Kind/namespace/name". -func ResourceKey(kind, namespace, name string) string { - if namespace == "" { - return fmt.Sprintf("%s//%s", kind, name) - } - return fmt.Sprintf("%s/%s/%s", kind, namespace, name) +// ResourceKey returns the index key for a resource: +// "group|Kind|namespace|name". 
Group goes first because both group and +// namespace can legitimately be empty independently — encoding group +// last would leave a cluster-scoped CRD key ambiguous with a +// namespaced core-group key under any 3-part parse. "|" is a safe +// delimiter — Kubernetes API groups follow DNS subdomain rules and +// can't contain it. Mirrors the same shape as the issue-source key in +// internal/summarycontext. +func ResourceKey(group, kind, namespace, name string) string { + return fmt.Sprintf("%s|%s|%s|%s", group, kind, namespace, name) } // IndexByResource builds a lookup map from ResourceKey → []Finding. func IndexByResource(findings []Finding) map[string][]Finding { m := make(map[string][]Finding) for _, f := range findings { - key := ResourceKey(f.Kind, f.Namespace, f.Name) + key := ResourceKey(f.Group, f.Kind, f.Namespace, f.Name) m[key] = append(m[key], f) } return m @@ -33,6 +37,7 @@ func GroupByResource(findings []Finding) []ResourceGroup { for _, fs := range index { g := ResourceGroup{ Kind: fs[0].Kind, + Group: fs[0].Group, Namespace: fs[0].Namespace, Name: fs[0].Name, Findings: fs, @@ -55,13 +60,46 @@ func GroupByResource(findings []Finding) []ResourceGroup { if groups[i].Warning != groups[j].Warning { return groups[i].Warning > groups[j].Warning } - return ResourceKey(groups[i].Kind, groups[i].Namespace, groups[i].Name) < - ResourceKey(groups[j].Kind, groups[j].Namespace, groups[j].Name) + return ResourceKey(groups[i].Group, groups[i].Kind, groups[i].Namespace, groups[i].Name) < + ResourceKey(groups[j].Group, groups[j].Kind, groups[j].Namespace, groups[j].Name) }) return groups } +// GroupForBuiltinKind maps a built-in Kubernetes Kind to the API group +// the audit suite scans it under. Returns "" for kinds the suite +// doesn't recognize — those don't get a populated Finding.Group, which +// means cross-group collision risk is bounded to the listed built-ins +// vs. third-party CRDs sharing the same Kind name. 
+// +// Kept here (next to ResourceKey) so the Kind→Group mapping lives in +// one place rather than every Finding{} emission site. buildResults +// populates Finding.Group via this helper before the index is built; +// per-check code stays terse and group-agnostic. +// +// Also reused by internal/issues to resolve Group on Problem-sourced +// rows that pre-date group-aware emission — keeps the (Kind→Group) +// table in one place across packages. +func GroupForBuiltinKind(kind string) string { + switch kind { + case "Pod", "Service", "ConfigMap", "Secret", "Node", "Namespace", + "PersistentVolume", "PersistentVolumeClaim", "ServiceAccount": + return "" + case "Deployment", "DaemonSet", "StatefulSet", "ReplicaSet": + return "apps" + case "Job", "CronJob": + return "batch" + case "HorizontalPodAutoscaler": + return "autoscaling" + case "Ingress", "NetworkPolicy": + return "networking.k8s.io" + case "PodDisruptionBudget": + return "policy" + } + return "" +} + // ApplySettings filters audit results based on ignored namespaces (with wildcard // patterns like *-system) and disabled checks. This is the shared implementation // used by all consumers (HTTP handlers, MCP, skyhook-connector). diff --git a/pkg/audit/helpers_test.go b/pkg/audit/helpers_test.go new file mode 100644 index 000000000..7ee965f71 --- /dev/null +++ b/pkg/audit/helpers_test.go @@ -0,0 +1,71 @@ +package audit + +import "testing" + +// TestResourceKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups produce distinct +// keys. Pre-fix, ResourceKey was group-blind: a Knative +// serving.knative.dev/Service "api" in "prod" collided with the core +// "" /Service "api" in "prod", and IndexByResource would conflate +// their findings (whichever Finding came last would shadow the other +// in the dedup checkKey, and any lookup by ResourceKey returned the +// pooled set). The fix routes Group through the key. 
+func TestResourceKey_GroupAware(t *testing.T) { + core := ResourceKey("", "Service", "prod", "api") + knative := ResourceKey("serving.knative.dev", "Service", "prod", "api") + if core == knative { + t.Fatalf("ResourceKey collides across groups: %q == %q", core, knative) + } +} + +// TestIndexByResource_NoCrossGroupCollision exercises the same fix +// end-to-end: emit two Findings for kind/ns/name "Service/prod/api", +// one with Group="" (core) and one with Group="serving.knative.dev" +// (Knative), and verify each lookup returns ONLY its own finding — +// not the union. +func TestIndexByResource_NoCrossGroupCollision(t *testing.T) { + findings := []Finding{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", CheckID: "core-finding"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", CheckID: "knative-finding"}, + } + idx := IndexByResource(findings) + + core := idx[ResourceKey("", "Service", "prod", "api")] + if len(core) != 1 || core[0].CheckID != "core-finding" { + t.Errorf("core lookup: got %+v, want 1 finding with CheckID=core-finding", core) + } + knative := idx[ResourceKey("serving.knative.dev", "Service", "prod", "api")] + if len(knative) != 1 || knative[0].CheckID != "knative-finding" { + t.Errorf("knative lookup: got %+v, want 1 finding with CheckID=knative-finding", knative) + } +} + +// TestGroupForBuiltinKind pins the (Kind→Group) table used by +// buildResults to populate Finding.Group for emission sites that leave +// it empty. Centralising the table here keeps per-check code terse; +// drift between this table and the actual API group a check scans +// would silently mis-key findings. 
+func TestGroupForBuiltinKind(t *testing.T) { + cases := map[string]string{ + "Pod": "", + "Service": "", + "ConfigMap": "", + "Secret": "", + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "ReplicaSet": "apps", + "Job": "batch", + "CronJob": "batch", + "HorizontalPodAutoscaler": "autoscaling", + "Ingress": "networking.k8s.io", + "NetworkPolicy": "networking.k8s.io", + "PodDisruptionBudget": "policy", + "UnknownCRD": "", + } + for kind, want := range cases { + if got := GroupForBuiltinKind(kind); got != want { + t.Errorf("GroupForBuiltinKind(%q) = %q, want %q", kind, got, want) + } + } +} diff --git a/pkg/audit/types.go b/pkg/audit/types.go index a5b8f91f4..4d7d63da3 100644 --- a/pkg/audit/types.go +++ b/pkg/audit/types.go @@ -63,8 +63,12 @@ type ScanResults struct { // ResourceGroup aggregates findings for a single resource. // Groups are sorted by severity (danger first), then by name. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. type ResourceGroup struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` Warning int `json:"warning"` @@ -88,8 +92,14 @@ type CategorySummary struct { } // Finding represents a single best-practice violation. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. Check emission sites leave Group="" — buildResults +// populates it via GroupForBuiltinKind so the (Kind→Group) map lives +// in one place rather than every check function. 
type Finding struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` CheckID string `json:"checkID"` From 7a1f006de8a5ce791f7fb146af0e45e6520a95e8 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Tue, 19 May 2026 01:23:18 +0300 Subject: [PATCH 32/33] fix(resourcecontext): normalize audit severity to issue vocabulary on the wire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit auditSummary.highestSeverity used to emit raw audit vocabulary ("danger" / "warning") while sibling issueSummary.highestSeverity emits issue vocabulary ("critical" / "warning"). Two fields under the same resourceContext disagreeing on what "highest severity" means is a real wire-shape footgun — consumers parsing one will mis-handle the other. Mirrors the same mapping internal/issues.fromAudit applies when audit findings flow through the unified issue stream, so the two paths now agree. Empty / future audit severities pass through unchanged so the contract stays explicit if pkg/audit grows new values. Pinned by TestNormalizeAuditSeverity. 
--- internal/server/ai_handlers.go | 22 ++++++++++++++-- internal/server/ai_handlers_severity_test.go | 27 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 internal/server/ai_handlers_severity_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 2c3d74fa0..10593ccfd 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -533,15 +533,33 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, group, kind, names } return match[i].CheckID < match[j].CheckID }) - topSeverity := match[0].Severity topFinding := match[0].CheckID return &resourcecontext.AuditSummary{ Count: len(match), - HighestSeverity: topSeverity, + HighestSeverity: normalizeAuditSeverity(match[0].Severity), TopFinding: topFinding, } } +// normalizeAuditSeverity maps the audit suite's emission vocabulary +// ("danger" / "warning") onto the unified resourceContext severity +// scale ("critical" / "warning") used by issueSummary. Two sibling +// fields in the same response reporting severity in different +// vocabularies — "danger" vs "critical" — is a wire-shape footgun for +// consumers. Mirrors the same mapping internal/issues.fromAudit +// applies when audit findings flow through the unified issue stream. +// Empty / unknown severities pass through unchanged so the contract +// stays explicit if the audit suite ever grows new values. +func normalizeAuditSeverity(s string) string { + switch s { + case bpaudit.SeverityDanger: + return string(issues.SeverityCritical) + case bpaudit.SeverityWarning: + return string(issues.SeverityWarning) + } + return s +} + // auditSeverityRank orders audit finding severities ("danger" > "warning"). 
func auditSeverityRank(s string) int { switch s { diff --git a/internal/server/ai_handlers_severity_test.go b/internal/server/ai_handlers_severity_test.go new file mode 100644 index 000000000..abeadcef8 --- /dev/null +++ b/internal/server/ai_handlers_severity_test.go @@ -0,0 +1,27 @@ +package server + +import ( + "testing" + + "github.com/skyhook-io/radar/internal/issues" + bpaudit "github.com/skyhook-io/radar/pkg/audit" +) + +// Pin the audit→issue severity normalization on the AuditSummary wire. +// Without it, sibling resourceContext fields disagree on what "highest +// severity" means: audit emits "danger" while issueSummary emits +// "critical". Mirror the same mapping internal/issues.fromAudit uses +// for the unified issue stream so consumers see one vocabulary. +func TestNormalizeAuditSeverity(t *testing.T) { + cases := map[string]string{ + bpaudit.SeverityDanger: string(issues.SeverityCritical), + bpaudit.SeverityWarning: string(issues.SeverityWarning), + "": "", // empty stays empty — explicit contract + "unknown": "unknown", // future audit values pass through + } + for in, want := range cases { + if got := normalizeAuditSeverity(in); got != want { + t.Errorf("normalizeAuditSeverity(%q) = %q, want %q", in, got, want) + } + } +} From 3d33d73226f7599a358236673deec04f2161eaf7 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Thu, 21 May 2026 02:59:20 +0300 Subject: [PATCH 33/33] feat: T7 + log pipe primitives + content search + bench-driven description fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle of bench-validated agent-surface improvements developed and tested on this combined branch. Not yet split into per-task PRs — see the wave-3 plan for the eventual split. 
T7 — MCP get_resource default-on resourceContext - internal/mcp/rc_rbac.go: request-scoped RefAccessChecker adapter - internal/mcp/resource_context.go: MCP-side IssueSummary/AuditSummary/ PolicyReport composers + topology lookup - internal/mcp/tools.go: getResourceInput.Context, group-first dispatch, Build wrapping into {resource, resourceContext} envelope resourcecontext additions (T7-supporting + sibling) - ServiceBackends lookup interface (resolve Service selector → ready Pods) - PodSummary, WorkloadSummary, StatusSummary projections - Owner ref handling + audit severity normalization fallthroughs - Tests across build.go and types.go Search content-field matching (B5 win driver) - internal/search/score.go: ContentField type, content-exact/substring scoring, MatchSnippet generation with bounded excerpt size - internal/search/candidate.go: candidate match assembly - internal/search/types.go: MatchSnippet struct + content:path site - Hooked into ConfigMap/Secret data, workload env refs, CRD spec fields Log pipe primitives (kubectl-arm parity) - pkg/ai/context/logs.go: FilterLogsByPattern (server-side grep) - internal/mcp/tools.go + tools_workloads.go: get_pod_logs + get_workload_logs accept grep, since (kubectl --since), previous (kubectl -p for CrashLoopBackOff diagnosis) - parseLogsSince helper with full unit tests Tool description tightening (descriptions are the highest-leverage knob) - tools_audit.go: "compliant resources are not returned" — addresses the B7 audit false-positive vector - tools_helm.go, tools_neighborhood.go, tools_rbac.go, tools_apply.go, tools_workloads.go: minor description fixes --- internal/mcp/rc_rbac.go | 91 +++++ internal/mcp/resource_context.go | 224 +++++++++++ internal/mcp/tools.go | 324 ++++++++++++---- internal/mcp/tools_apply.go | 1 - internal/mcp/tools_audit.go | 19 +- internal/mcp/tools_filter_test.go | 72 ++++ internal/mcp/tools_helm.go | 4 +- internal/mcp/tools_neighborhood.go | 2 +- internal/mcp/tools_rbac.go | 12 +- 
internal/mcp/tools_workloads.go | 61 ++- internal/search/candidate.go | 77 ++++ internal/search/score.go | 105 ++++- internal/search/score_test.go | 58 ++- internal/search/search.go | 11 +- internal/search/search_test.go | 63 +++ internal/search/types.go | 12 +- internal/server/ai_handlers.go | 31 +- pkg/ai/context/logs.go | 20 + pkg/ai/context/logs_test.go | 27 ++ pkg/resourcecontext/build.go | 589 ++++++++++++++++++++++++++++- pkg/resourcecontext/build_test.go | 398 +++++++++++++++++++ pkg/resourcecontext/types.go | 183 ++++++++- 22 files changed, 2212 insertions(+), 172 deletions(-) create mode 100644 internal/mcp/rc_rbac.go create mode 100644 internal/mcp/resource_context.go diff --git a/internal/mcp/rc_rbac.go b/internal/mcp/rc_rbac.go new file mode 100644 index 000000000..393c26ee3 --- /dev/null +++ b/internal/mcp/rc_rbac.go @@ -0,0 +1,91 @@ +package mcp + +import ( + "context" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// requestScopedChecker adapts the MCP-side RBAC helpers +// (canReadInNamespace / canReadClusterScopedKind) into +// resourcecontext.RefAccessChecker with a request-local memoization layer +// keyed on (verb, group, kind, namespace). +// +// A single resourceContext build emits ~30 candidate refs but only ~5 +// distinct (group, kind, namespace) tuples — caching here collapses the +// SAR fan-out before reaching the inner per-user cache. Mirrors the REST +// equivalent in internal/server/rc_rbac.go so the two surfaces share the +// same enforcement story. +// +// Request-scoped (not server-scoped): per-user caching already lives one +// layer down. This layer only deduplicates the burst a single Build +// invocation generates. +type requestScopedChecker struct { + ctx context.Context + cache map[string]bool +} + +// newMCPRequestScopedChecker returns a checker scoped to a single MCP +// tool call. Not safe for concurrent use across calls. 
+func newMCPRequestScopedChecker(ctx context.Context) *requestScopedChecker { + return &requestScopedChecker{ + ctx: ctx, + cache: make(map[string]bool, 8), + } +} + +// CanRead implements resourcecontext.RefAccessChecker. +// +// Authorization rules mirror the REST adapter: +// - Namespaced kinds: SAR on (verb=get, group, resource, namespace). +// - Cluster-scoped kinds (namespace == ""): SAR on (verb=get, group, +// resource, ""). +// - Unknown kinds (not in discovery, not in static catalogue) pass +// through — Build only emits refs whose kinds are known to the +// topology builder, and an unknown kind here is a temporary +// discovery-cold state, not a permission bypass vector. +func (c *requestScopedChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + key := "get|" + group + "|" + kind + "|" + namespace + if v, ok := c.cache[key]; ok { + return v + } + + resource := lookupResourceName(kind, group) + if resource == "" { + c.cache[key] = true + return true + } + + var allowed bool + if namespace == "" { + allowed = canReadClusterScopedKind(c.ctx, kind, group, "get") + } else { + allowed = canReadInNamespace(c.ctx, group, resource, namespace, "get") + } + c.cache[key] = allowed + return allowed +} + +// Compile-time assertion that requestScopedChecker satisfies the contract. +var _ resourcecontext.RefAccessChecker = (*requestScopedChecker)(nil) + +// lookupResourceName resolves a (kind, group) pair to the canonical plural +// resource name used by SubjectAccessReview. Tries the static cluster-only +// catalogue first (covers Nodes / ClusterRoles / etc.), then discovery for +// everything else including CRDs. Returns "" when neither path knows the +// kind. Mirrors internal/server/rc_rbac.go's helper of the same name. 
+func lookupResourceName(kind, group string) string { + if kind == "" { + return "" + } + if g, r, ok := k8s.ClusterOnlyKindGVR(kind); ok && (group == "" || group == g) { + return r + } + if disc := k8s.GetResourceDiscovery(); disc != nil { + if ar, ok := disc.GetResourceWithGroup(kind, group); ok { + return ar.Name + } + } + return "" +} diff --git a/internal/mcp/resource_context.go b/internal/mcp/resource_context.go new file mode 100644 index 000000000..6886da284 --- /dev/null +++ b/internal/mcp/resource_context.go @@ -0,0 +1,224 @@ +package mcp + +import ( + "sort" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/audit" + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" + "github.com/skyhook-io/radar/pkg/resourcecontext" + topo "github.com/skyhook-io/radar/pkg/topology" +) + +// mcpPolicyReportLookupAdapter wraps k8s.GetPolicyReportIndex into the +// resourcecontext.PolicyReportLookup interface. Mirrors the REST adapter in +// internal/server/ai_handlers.go — keeping the projection narrow here lets +// pkg/policyreports.Finding evolve without perturbing the wire contract. 
+type mcpPolicyReportLookupAdapter struct { + idx *policyreports.Index +} + +func (a mcpPolicyReportLookupAdapter) FindingsFor(group, kind, namespace, name string) []resourcecontext.KyvernoFinding { + if a.idx == nil { + return nil + } + findings := a.idx.FindingsFor(group, kind, namespace, name) + if len(findings) == 0 { + return nil + } + out := make([]resourcecontext.KyvernoFinding, len(findings)) + for i, f := range findings { + out[i] = resourcecontext.KyvernoFinding{ + Policy: f.Policy, + Rule: f.Rule, + Result: f.Result, + Message: f.Message, + } + } + return out +} + +type mcpServiceBackendLookup struct { + cache *k8s.ResourceCache +} + +func (l mcpServiceBackendLookup) PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) { + if l.cache == nil || l.cache.Pods() == nil { + return nil, nil + } + return l.cache.Pods().Pods(namespace).List(selector) +} + +// computeMCPIssueSummary rolls up per-resource issue-composer rows +// (problem + condition + optional audit) into an IssueSummary. Mirrors the +// REST handler's computeIssueSummaryForResource — same composer call, same +// group-aware iteration filter, same deterministic sort. The composer's +// native namespace filter restricts the scan to the resource's namespace; +// the per-row group check prevents cross-group collisions where a CRD and +// a built-in share kind+ns+name. +// +// Pascal-singular kind required: the composer's Filters.Kinds matcher +// case-folds both sides but doesn't plural-to-singular convert. Callers +// pass canonicalKind from obj's TypeMeta. 
+func computeMCPIssueSummary(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.IssueSummary { + if cache == nil { + return nil + } + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + filters := issues.Filters{ + Kinds: []string{kind}, + Limit: issues.MaxLimit, + } + if namespace != "" { + filters.Namespaces = []string{namespace} + } + rows, _ := issues.ComposeWithStats(provider, filters) + + matched := make([]issues.Issue, 0, len(rows)) + bySource := make(map[string]int) + for _, row := range rows { + if row.Name != name { + continue + } + if namespace != "" && row.Namespace != namespace { + continue + } + if row.Group != group { + continue + } + matched = append(matched, row) + bySource[string(row.Source)]++ + } + if len(matched) == 0 { + return nil + } + // (severity desc, Reason asc) — deterministic across runs. + sort.Slice(matched, func(i, j int) bool { + ri, rj := mcpComposeSeverityRank(matched[i].Severity), mcpComposeSeverityRank(matched[j].Severity) + if ri != rj { + return ri > rj + } + return matched[i].Reason < matched[j].Reason + }) + return &resourcecontext.IssueSummary{ + Count: len(matched), + HighestSeverity: string(matched[0].Severity), + TopReason: matched[0].Reason, + BySource: bySource, + } +} + +func mcpComposeSeverityRank(s issues.Severity) int { + switch s { + case issues.SeverityCritical: + return 2 + case issues.SeverityWarning: + return 1 + } + return 0 +} + +// computeMCPAuditSummary looks up audit findings for the subject resource +// via the group-aware (group, Kind, ns, name) key. Mirrors the REST +// handler's computeAuditSummaryForResource. +// +// kind MUST be Pascal singular — the audit check runner writes that into +// Finding.Kind, and Finding.Group is populated by audit.buildResults via +// the built-in (Kind→Group) table, so the lookup keys correctly. 
+func computeMCPAuditSummary(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil || kind == "" { + return nil + } + var namespaces []string + if namespace != "" { + namespaces = []string{namespace} + } + results := audit.RunFromCache(cache, namespaces, nil) + if results == nil || len(results.Findings) == 0 { + return nil + } + idx := bpaudit.IndexByResource(results.Findings) + match := idx[bpaudit.ResourceKey(group, kind, namespace, name)] + if len(match) == 0 { + return nil + } + + sort.Slice(match, func(i, j int) bool { + ri, rj := mcpAuditSeverityRank(match[i].Severity), mcpAuditSeverityRank(match[j].Severity) + if ri != rj { + return ri > rj + } + return match[i].CheckID < match[j].CheckID + }) + + return &resourcecontext.AuditSummary{ + Count: len(match), + HighestSeverity: mcpNormalizeAuditSeverity(match[0].Severity), + TopFinding: match[0].CheckID, + } +} + +func mcpAuditSeverityRank(s string) int { + switch s { + case bpaudit.SeverityDanger: + return 2 + case bpaudit.SeverityWarning: + return 1 + } + return 0 +} + +// mcpNormalizeAuditSeverity maps the audit suite's emission vocabulary +// ("danger" / "warning") onto the unified resourceContext severity scale +// ("critical" / "warning") used by issueSummary. Two sibling fields in +// the same response reporting severity in different vocabularies is a +// wire-shape footgun — mirror the REST handler's normalizeAuditSeverity. +func mcpNormalizeAuditSeverity(s string) string { + switch s { + case bpaudit.SeverityDanger: + return string(issues.SeverityCritical) + case bpaudit.SeverityWarning: + return string(issues.SeverityWarning) + } + return s +} + +// mcpTopologyForContext returns a per-call topology snapshot scoped to the +// resource's namespace (cluster-scoped resources get an all-namespaces +// build). Reuses the package-level summaryCtxTopoMemo cache to amortize +// build cost across get_resource and list_resources / search calls. 
nil +// return is fine — Build then skips topology-derived fields and the +// remaining sidecar still populates. +func mcpTopologyForContext(namespace string) (*topo.Topology, topo.ResourceProvider, topo.DynamicProvider, bool) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, nil, nil, false + } + opts := topo.DefaultBuildOptions() + if namespace != "" { + opts.Namespaces = []string{namespace} + } + provider := k8s.NewTopologyResourceProvider(cache) + dyn := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + topology, err := summaryCtxTopoMemo.Get(opts, func() (*topo.Topology, error) { + return topo.NewBuilder(provider).WithDynamic(dyn).Build(opts) + }) + if err != nil || topology == nil { + return nil, nil, nil, false + } + return topology, provider, dyn, true +} + +// _ guards the imports used by Build's Options struct from being marked +// unused if the helpers above ever drop their references during refactors. +var _ = runtime.Object(nil) diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 7852a106a..79649b8fd 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "log" + "regexp" "sort" "strings" "time" @@ -14,6 +15,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/filter" "github.com/skyhook-io/radar/internal/helm" @@ -23,6 +25,7 @@ import ( "github.com/skyhook-io/radar/internal/summarycontext" "github.com/skyhook-io/radar/internal/timeline" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" topology "github.com/skyhook-io/radar/pkg/topology" ) @@ -31,64 +34,92 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_dashboard", - Description: "Get cluster health overview including resource counts, " + - 
"problems (failing pods, unhealthy deployments), recent warning events, " + - "and Helm release status. Start here to understand cluster state before " + - "drilling into specific resources.", + Description: "Use for inventory-style cluster or namespace health triage, like " + + "`kubectl get all` plus detected problems and warning events in one call. " + + "Returns resource counts, failing pods, unhealthy workloads, recent Warning " + + "events, and Helm release status so you can rank likely suspects before " + + "calling get_resource or logs. Routing: unknown broken thing -> issues; " + + "content/name search -> search; service routing/dependencies -> get_topology " + + "or get_neighborhood; inventory/counts/Helm/events overview -> get_dashboard.", Annotations: readOnly, }, logToolCall("get_dashboard", handleGetDashboard)) mcp.AddTool(server, &mcp.Tool{ Name: "list_resources", - Description: "List Kubernetes resources of a given kind with minified summaries. " + - "Supports all built-in kinds (pods, deployments, services, etc.) and CRDs. " + - "Use to discover what's running before inspecting individual resources.", + Description: "Use for a jq-like namespace sweep when you know the resource kind " + + "(pods, deployments, services, configmaps, CRDs). Returns compact Kubernetes-shaped " + + "rows plus summaryContext by default (managedBy, health, issueCount) so you can " + + "compare many similar resources and pick suspects before calling get_resource. " + + "For unknown kind/name searches, use search. For broad health triage, use " + + "get_dashboard or issues first.", Annotations: readOnly, }, logToolCall("list_resources", handleListResources)) mcp.AddTool(server, &mcp.Tool{ Name: "get_resource", - Description: "Get detailed information about a single Kubernetes resource. " + - "Returns minified spec, status, and metadata. " + - "Use after list_resources to drill into a specific resource. 
" + - "Optionally include related context (events, relationships, metrics, logs) " + - "using the 'include' parameter (comma-separated) to avoid extra tool calls.", + Description: "Use AFTER narrowing to one resource. Returns the resource's " + + "Kubernetes-shaped spec/status/metadata plus resourceContext when available " + + "(relationships, refs, issue/audit/policy rollups). This is the drill-down " + + "tool, not the best first call for broad incidents. Start with issues, " + + "get_dashboard, search, or list_resources to rank candidates; then call " + + "get_resource for the exact object. If you are looking for a string across " + + "ConfigMaps, CRD specs, env refs, or object content, use search instead of " + + "fetching resources one by one. Use the group parameter for ambiguous " + + "kinds such as Knative Service vs core Service.", Annotations: readOnly, }, logToolCall("get_resource", handleGetResource)) mcp.AddTool(server, &mcp.Tool{ Name: "get_topology", - Description: "Get the topology graph showing relationships between Kubernetes resources. " + - "Returns nodes and edges representing Deployments, Services, Ingresses, Pods, etc. " + - "Use 'traffic' view for network flow or 'resources' view for ownership hierarchy.", + Description: "Use to map a multi-service incident or dependency graph, preferably " + + "scoped to a namespace. " + + "Returns Kubernetes resource nodes and edges (Services, workloads, Pods, " + + "Ingresses, ConfigMaps, Secrets, owners) so you can see service-to-workload " + + "traffic and ownership relationships instead of inspecting resources one by one. " + + "Use view=traffic for routing/connectivity questions and view=resources for " + + "ownership/deployment hierarchy. Always specify namespace unless you specifically " + + "need a cross-namespace graph. 
If you already know the suspicious root, use " + + "get_neighborhood for a smaller focused graph.", Annotations: readOnly, }, logToolCall("get_topology", handleGetTopology)) mcp.AddTool(server, &mcp.Tool{ Name: "get_neighborhood", - Description: "Get the BFS-expanded neighborhood of a specific resource — the slice " + - "of the topology graph immediately relevant to one root. Cheaper and more " + - "focused than get_topology when you already know which resource you care " + - "about. Profile is 'auto' (default — picks a bounded edge set from the root " + - "kind) or 'all' (every edge type). Hops controls BFS depth (default 1, max " + - "2). Nodes are RBAC-filtered against the caller; dropped neighbors are " + - "listed in `omitted` with reason=rbac_denied. If max_nodes is exceeded " + - "mid-expansion, truncated=true is set and a partial subgraph is returned.", + Description: "Use when investigating cross-resource failures around a known " + + "resource: service routing, targetPort/selector/endpoints problems, dependency " + + "timeouts, config/secret refs, owner chains, or traffic not reaching pods. " + + "Returns the BFS-expanded topology neighborhood around one root, which is " + + "usually cheaper and clearer than get_topology once you have a suspect. " + + "Typical flow: issues/search/list_resources identify a Service or workload, " + + "then get_neighborhood traces its upstream/downstream Services, workloads, " + + "Pods, refs, and owners. Profile auto (default) picks a bounded edge set " + + "from the root kind; profile all expands every edge type and is heavier, " + + "use it only when auto produced a too-narrow neighborhood. Hops defaults to " + + "1 and maxes at 2. 
Nodes are RBAC-filtered; denied neighbors appear only as " + + "aggregate omitted counts.", Annotations: readOnly, }, logToolCall("get_neighborhood", handleGetNeighborhood)) mcp.AddTool(server, &mcp.Tool{ Name: "get_events", - Description: "Get recent Kubernetes warning events, deduplicated and sorted by recency. " + - "Useful for diagnosing issues — shows event reason, message, and occurrence count.", + Description: "Use for recent Kubernetes Warning events after an overview points " + + "at a namespace or resource, or when the symptom is scheduling, pulling images, " + + "restarts, failed mounts, readiness, or controller errors. Events are deduplicated " + + "and sorted by recency with reason, message, and count. For a ranked issue list " + + "that includes problems/conditions, use issues first.", Annotations: readOnly, }, logToolCall("get_events", handleGetEvents)) mcp.AddTool(server, &mcp.Tool{ Name: "get_pod_logs", - Description: "Get filtered log lines from a pod, prioritizing errors and warnings. " + - "Returns diagnostically relevant lines (errors, panics, stack traces) or " + - "falls back to the last 20 lines if no error patterns match.", + Description: "Use only after narrowing to a specific Pod/container. Returns " + + "diagnostically relevant log lines (errors, panics, stack traces, warnings) " + + "or falls back to recent tail lines. Set grep to server-side filter like " + + "`kubectl logs | grep PATTERN` when you know an error string, request path, " + + "service name, or trace id. For broad incidents, first use issues, " + + "get_dashboard, search, list_resources, or get_neighborhood to avoid reading " + + "logs from many unrelated pods. 
If the target is a config value, feature flag, " + + "CRD field, env ref, or YAML/spec content, use search rather than logs.", Annotations: readOnly, }, logToolCall("get_pod_logs", handleGetPodLogs)) @@ -101,9 +132,12 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_changes", - Description: "Get recent resource changes (creates, updates, deletes) from the cluster timeline. " + - "Use to investigate what changed before an incident. " + - "Filter by namespace, resource kind, or specific resource name.", + Description: "Use when the symptom is 'this worked earlier' or 'something broke " + + "after a deploy/config change.' Returns a chronological feed of resource " + + "creates, updates, and deletes such as image changes, ConfigMap edits, scale " + + "events, label edits, and rollout churn. This is often faster than reading " + + "ReplicaSet histories or individual audit/log streams. Pair with since to " + + "bound the window; filter by namespace, kind, or name when you know the scope.", Annotations: readOnly, }, logToolCall("get_changes", handleGetChanges)) @@ -115,7 +149,10 @@ func registerTools(server *mcp.Server) { "with remediation guidance. Checks cover security (running as root, privileged " + "containers, dangerous capabilities), reliability (missing probes, single replicas, " + "no PDB), and efficiency (missing resource requests/limits). " + - "Each finding includes what's wrong and how to fix it. " + + "Each finding includes what's wrong and how to fix it. The findings list contains " + + "only resources with audit violations; resources absent from findings should not " + + "be reported as non-compliant. If findings is empty for the requested scope/filter, " + + "there are no audit violations to report for that scope/filter. " + "Respects user's audit settings (ignored namespaces, disabled checks). 
" + "Filter by namespace, category, or severity.", Annotations: readOnly, @@ -137,7 +174,7 @@ func registerTools(server *mcp.Server) { Description: "Get detailed information about a specific Helm release including owned resources " + "and their status. Optionally include values, revision history, or manifest diff between revisions " + "using the 'include' parameter (comma-separated: values, history, diff). " + - "For diff, also provide diff_revision_1 and optionally diff_revision_2.", + "diff_revision_1 and diff_revision_2 are only used when include contains diff.", Annotations: readOnly, }, logToolCall("get_helm_release", handleGetHelmRelease)) @@ -177,8 +214,10 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "issues", - Description: "Unified cluster-health view. Combines hardcoded problem detection " + - "(failing Deployments / StatefulSets / CronJobs / HPAs / Nodes / Jobs / PVCs), " + + Description: "Use when the symptom is broad, unknown, or you need a ranked " + + "list of likely broken resources. This is the fastest way to find what is " + + "unhealthy before inspecting individual objects. Combines hardcoded problem " + + "detection (failing Deployments / StatefulSets / CronJobs / HPAs / Nodes / Jobs / PVCs), " + "recent K8s Warning events, and a generic CRD .status.conditions[] " + "fallback that lights up Argo / Flux / Knative / Crossplane / cert-manager / " + "KEDA without per-integration code. Severity is normalized to " + @@ -187,13 +226,17 @@ func registerTools(server *mcp.Server) { "(PolicyReport findings) are excluded by default because each can run " + "50–1000+ rows per cluster. The `source` param is a FILTER: " + "source=kyverno returns ONLY Kyverno rows (no problems, no conditions). " + + "For compliance, security-posture, or audit questions, use source=audit " + + "only; source=problem, source=condition, and source=event are runtime " + + "health signals and should not be reported as audit violations. 
" + + "When source includes audit, only resources with audit findings are returned; " + + "do not infer or report audit violations for resources that are absent. " + "To ADD an excluded source to the defaults via MCP, list everything " + "you want explicitly — e.g. source=problem,condition,kyverno returns " + - "defaults plus Kyverno. (The REST /api/issues endpoint also exposes " + - "include_audit / include_events / include_kyverno boolean flags as " + - "shortcuts, but MCP only takes the source list.) Use this instead of " + - "get_dashboard when you want the full health picture across all " + - "sources, or to filter by severity / source / kind / namespace.", + "defaults plus Kyverno. For a broader inventory plus Helm and event overview, " + + "use get_dashboard. After identifying a suspect issue, call get_resource for " + + "exact spec/status or get_neighborhood when the failure likely crosses " + + "Services/workloads/Pods/dependencies.", Annotations: readOnly, }, logToolCall("issues", handleIssuesTool)) @@ -201,15 +244,21 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "search", - Description: "Free-text resource search across this cluster's cache. Matches on " + - "name, namespace, label values, annotation values, container images, and " + - "kind. Tokens are AND'd. Modifiers: kind:Pod, ns:foo, label:app=bar, " + - "image:redis. Returns ranked hits with optional summary or raw object. " + - "Use this instead of list_resources when you don't already know the kind, " + - "namespace, or exact name — for example 'find anything called redis' or " + - "'show me everything pulling from quay.io/x'. Searches typed kinds plus " + - "any CRDs already warmed in the cache; cold CRDs need a list_resources " + - "call first to start watching.", + Description: "Use when you do not know the exact kind, namespace, or name, or " + + "when you need a grep-like scan across cached Kubernetes objects. 
Matches " + + "name, namespace, label values, annotation values, container images, kind, " + + "and searchable object content such as ConfigMap data, spec fields, status " + + "messages, env refs, and CRD specs. Tokens are AND'd. Examples: " + + "`adServiceFailure` finds feature flags in ConfigMap data; " + + "`kind:NetworkChaos delay` or `kind:PodChaos app=cart` finds Chaos Mesh " + + "faults; `image:flagd` finds feature-flag infrastructure. Modifiers include " + + "kind:Pod, kind:NetworkChaos, ns:foo, label:app=bar, image:redis. Returns ranked hits with matched " + + "content snippets and summaryContext by default so you can rank suspects " + + "before get_resource. Use this for feature flags, Chaos Mesh objects, " + + "secret/config refs, unknown CRD names, or 'where does this string appear?' " + + "questions. Use CEL filter for structural predicates over kind/apiVersion/" + + "metadata/spec/status/labels/annotations. Searches typed kinds plus CRDs " + + "already warmed in cache; cold CRDs need list_resources first to start watching.", Annotations: readOnly, }, logToolCall("search", handleSearch)) @@ -237,9 +286,14 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_workload_logs", - Description: "Get aggregated, AI-filtered logs from all pods of a workload (Deployment, StatefulSet, " + - "or DaemonSet). Logs are collected from all matching pods concurrently, filtered for errors/warnings, " + - "and deduplicated. More useful than get_pod_logs when you need logs across all replicas of a workload.", + Description: "Get aggregated logs from all pods of a workload (Deployment, StatefulSet, " + + "or DaemonSet). Logs are collected from all matching pods concurrently, then " + + "server-side filtered to errors, warnings, panics, and stack traces using " + + "deterministic regex patterns and deduplicated. Set grep for additional " + + "server-side filtering before that summary stage, like `kubectl logs | grep PATTERN`. 
" + + "More useful than get_pod_logs when you need logs across all replicas of a workload. " + + "If the target is a config value, feature flag, CRD field, env ref, or YAML/spec " + + "content, use search rather than logs.", Annotations: readOnly, }, logToolCall("get_workload_logs", handleGetWorkloadLogs)) @@ -310,14 +364,14 @@ func registerTools(server *mcp.Server) { // Tool input types type dashboardInput struct { - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace. Use when triaging one app/tenant namespace before drilling into individual resources."` } type listResourcesInput struct { - Kind string `json:"kind" jsonschema:"resource kind to list, e.g. pods, deployments, services, configmaps"` + Kind string `json:"kind" jsonschema:"resource kind to list for a broad sweep, e.g. pods, deployments, services, configmaps. Prefer this before get_resource when comparing many same-kind objects."` Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. serving.knative.dev for Knative Service vs core Service)"` - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` - Context string `json:"context,omitempty" jsonschema:"per-row context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare rows"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace for app-scoped triage"` + Context string `json:"context,omitempty" jsonschema:"per-row context: default attaches summaryContext (managedBy + health + issueCount) for suspect ranking; 'none' returns bare rows"` } type getResourceInput struct { @@ -325,12 +379,13 @@ type getResourceInput struct { Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. 
cluster.x-k8s.io for CAPI Cluster vs CNPG Cluster)"` Namespace string `json:"namespace" jsonschema:"resource namespace"` Name string `json:"name" jsonschema:"resource name"` - Include string `json:"include,omitempty" jsonschema:"comma-separated extras to include: events, relationships, metrics, logs"` + Include string `json:"include,omitempty" jsonschema:"optional sidecar data after narrowing to this object: events, relationships, metrics, logs. Separate from context; include may fetch heavier live/derived data."` + Context string `json:"context,omitempty" jsonschema:"resourceContext tier: default/basic attaches relationship and finding rollups (managedBy, exposes, selectedBy, uses, runsOn, issueSummary, auditSummary); 'none' returns a bare minified resource."` } type topologyInput struct { - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` - View string `json:"view,omitempty" jsonschema:"view mode: traffic for network flow or resources for ownership hierarchy"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace for a multi-service incident map; recommended unless you need cross-namespace topology"` + View string `json:"view,omitempty" jsonschema:"view mode: traffic for service routing/connectivity or resources for ownership hierarchy"` Format string `json:"format,omitempty" jsonschema:"output format: graph (default, full node/edge data) or summary (text description of resource chains)"` } @@ -354,14 +409,25 @@ type podLogsInput struct { Name string `json:"name" jsonschema:"pod name"` Container string `json:"container,omitempty" jsonschema:"container name, defaults to first container"` TailLines int `json:"tail_lines,omitempty" jsonschema:"number of lines to fetch from the end (default 200)"` + Grep string `json:"grep,omitempty" jsonschema:"optional regular expression to keep matching log lines before diagnostic filtering, like kubectl logs | grep PATTERN"` + Since string 
`json:"since,omitempty" jsonschema:"only return logs newer than this duration (e.g. 30s, 10m, 1h), like kubectl logs --since"` + Previous bool `json:"previous,omitempty" jsonschema:"return logs from the previous terminated container instance (e.g. for CrashLoopBackOff diagnosis), like kubectl logs -p"` } type searchInput struct { - Q string `json:"q" jsonschema:"search string. Free tokens AND'd. Modifiers: kind:Pod, ns:foo, label:k=v, image:redis"` + Query string `json:"query,omitempty" jsonschema:"search query for unknown resources or broad content scans. Free tokens AND'd. Matches identity plus searchable object content. Examples: adServiceFailure, kind:NetworkChaos delay, kind:ConfigMap flagd, image:flagd. Modifiers: kind:Pod, kind:NetworkChaos, ns:foo, label:k=v, image:redis"` + Q string `json:"q,omitempty" jsonschema:"alias for query"` Limit int `json:"limit,omitempty" jsonschema:"max hits returned (default 50, max 500)"` Include string `json:"include,omitempty" jsonschema:"per-hit detail: summary (default), raw, or none"` Filter string `json:"filter,omitempty" jsonschema:"optional CEL boolean expression run against each candidate K8s object. Bindings: kind, apiVersion, metadata, spec, status, labels, annotations. Use has(x.y) before optional fields. 
Examples: 'kind == \"Pod\" && status.phase == \"Failed\"', 'labels[\"app\"] == \"cart\"', 'has(status.readyReplicas) && status.readyReplicas == 0'"` - Context string `json:"context,omitempty" jsonschema:"per-hit context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare hits"` + Context string `json:"context,omitempty" jsonschema:"per-hit context: default attaches summaryContext (managedBy + health + issueCount) for suspect ranking; 'none' returns bare hits"` +} + +func (in searchInput) query() string { + if strings.TrimSpace(in.Query) != "" { + return in.Query + } + return in.Q } type issuesInput struct { @@ -616,43 +682,119 @@ func handleGetResource(ctx context.Context, req *mcp.CallToolRequest, input getR } } - // Try typed cache first. rawObj is the un-minified resource, threaded - // into attachResourceExtras so ManagedBy synthesis can disambiguate by - // group (avoids Knative Service vs core Service kind/plural collisions). + // Fetch the resource. When group is set, skip the typed cache and route + // directly to the dynamic cache: typed FetchResource is group-blind + // (e.g. for kind=services it returns the core Service regardless of any + // group qualifier), so a group-qualified call like serving.knative.dev/ + // Service would silently leak the wrong object. Mirrors the same + // group-first dispatch fix on the REST GET path in PR #721. 
var resourceData any - var rawObj any - obj, err := k8s.FetchResource(cache, kind, namespace, name) - if err == k8s.ErrUnknownKind { - // Fall through to dynamic cache for CRDs + var rawObj runtime.Object + if group != "" { u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) if dynErr != nil { return nil, nil, fmt.Errorf("resource not found: %w", dynErr) } resourceData = aicontext.MinifyUnstructured(u, aicontext.LevelDetail) rawObj = u - } else if err != nil { - return nil, nil, fmt.Errorf("resource not found: %w", err) } else { - k8s.SetTypeMeta(obj) - minified, minErr := aicontext.Minify(obj, aicontext.LevelDetail) - if minErr != nil { - return nil, nil, fmt.Errorf("failed to minify: %w", minErr) + obj, err := k8s.FetchResource(cache, kind, namespace, name) + if err == k8s.ErrUnknownKind { + u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if dynErr != nil { + return nil, nil, fmt.Errorf("resource not found: %w", dynErr) + } + resourceData = aicontext.MinifyUnstructured(u, aicontext.LevelDetail) + rawObj = u + } else if err != nil { + return nil, nil, fmt.Errorf("resource not found: %w", err) + } else { + k8s.SetTypeMeta(obj) + minified, minErr := aicontext.Minify(obj, aicontext.LevelDetail) + if minErr != nil { + return nil, nil, fmt.Errorf("failed to minify: %w", minErr) + } + resourceData = minified + rawObj = obj } - resourceData = minified - rawObj = obj } + // Build the resourceContext sidecar unless the caller opted out. Basic + // tier is the default: cheap managedBy / exposes / selectedBy / + // runsOn / uses / issueSummary / auditSummary / policySummary. Pass + // context=none for a bare minified resource (bulk scans, raw jq work). 
+ contextMode := strings.ToLower(strings.TrimSpace(input.Context)) includes := parseIncludes(input.Include) - if len(includes) == 0 { + skipContext := contextMode == "none" + + var resourceCtx *resourcecontext.ResourceContext + if !skipContext { + resourceCtx = buildMCPResourceContext(ctx, rawObj, kind, namespace, name) + } + + // Three shapes: + // - bare resource: no includes, context=none + // - resource + resourceContext: no includes, default context + // - resource + resourceContext + extras: includes set + if len(includes) == 0 && resourceCtx == nil { return toJSONResult(resourceData) } - // Build enriched response with requested extras result := map[string]any{"resource": resourceData} - attachResourceExtras(ctx, cache, result, includes, kind, namespace, name, rawObj) + if resourceCtx != nil { + result["resourceContext"] = resourceCtx + } + if len(includes) > 0 { + attachResourceExtras(ctx, cache, result, includes, kind, namespace, name, rawObj) + } return toJSONResult(result) } +// buildMCPResourceContext assembles the resourceContext sidecar for MCP +// get_resource. Mirrors the REST handler's buildAIResourceContext: pre- +// computes IssueSummary + AuditSummary in the caller, threads the +// PolicyReport index when Kyverno is installed, hands a request-scoped +// RBAC checker to Build for per-ref gating, and lets Build's own +// fallback resolve Relationships via topology.GetRelationshipsWithObject +// (which applies KindForGVK so cross-group CRDs map to the right +// topology node). 
+func buildMCPResourceContext(ctx context.Context, obj runtime.Object, kind, namespace, name string) *resourcecontext.ResourceContext { + if obj == nil { + return nil + } + cache := k8s.GetResourceCache() + + gvk := obj.GetObjectKind().GroupVersionKind() + canonicalKind := gvk.Kind + if canonicalKind == "" { + canonicalKind = kind + } + canonicalGroup := gvk.Group + + issueSum := computeMCPIssueSummary(cache, canonicalGroup, canonicalKind, namespace, name) + auditSum := computeMCPAuditSummary(cache, canonicalGroup, canonicalKind, namespace, name) + + opts := resourcecontext.Options{ + Tier: resourcecontext.TierBasic, + AccessChecker: newMCPRequestScopedChecker(ctx), + IssueSummary: issueSum, + AuditSummary: auditSum, + ServiceBackends: mcpServiceBackendLookup{cache: cache}, + } + + if idx := k8s.GetPolicyReportIndex(); idx != nil { + opts.PolicyReports = mcpPolicyReportLookupAdapter{idx: idx} + } + + if topo, prov, dyn, ok := mcpTopologyForContext(namespace); ok { + opts.Topology = topo + opts.Provider = prov + opts.DynamicProv = dyn + } + + return resourcecontext.Build(ctx, obj, opts) +} + // attachResourceExtras populates optional extras (events, relationships, metrics, logs) // on the result map based on the includes set. 
rawObj is the already-fetched // resource (typed or *unstructured); passed through so relationship synthesis @@ -1203,9 +1345,20 @@ func handleGetPodLogs(ctx context.Context, req *mcp.CallToolRequest, input podLo if input.TailLines > 0 { tailLines = int64(input.TailLines) } + if strings.TrimSpace(input.Grep) != "" { + if _, err := regexp.Compile(input.Grep); err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } + } + sinceSeconds, err := parseLogsSince(input.Since) + if err != nil { + return nil, nil, err + } opts := &corev1.PodLogOptions{ - TailLines: &tailLines, + TailLines: &tailLines, + SinceSeconds: sinceSeconds, + Previous: input.Previous, } if input.Container != "" { opts.Container = input.Container @@ -1222,7 +1375,10 @@ func handleGetPodLogs(ctx context.Context, req *mcp.CallToolRequest, input podLo return nil, nil, fmt.Errorf("failed to read logs: %w", err) } - filtered := aicontext.FilterLogs(string(data)) + filtered, err := aicontext.FilterLogsByPattern(string(data), input.Grep) + if err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } return toJSONResult(filtered) } @@ -2041,7 +2197,11 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp if provider == nil { return nil, nil, fmt.Errorf("not connected to cluster") } - parsed := search.Parse(input.Q) + query := input.query() + if query == "" { + return nil, nil, fmt.Errorf("query is required") + } + parsed := search.Parse(query) allowed := filterNamespacesForUser(ctx, nil) if allowed != nil && len(allowed) == 0 { return toJSONResult(search.Result{Hits: []search.Hit{}}) diff --git a/internal/mcp/tools_apply.go b/internal/mcp/tools_apply.go index 5aea1dbbd..5a4206aab 100644 --- a/internal/mcp/tools_apply.go +++ b/internal/mcp/tools_apply.go @@ -88,4 +88,3 @@ func handleApplyResource(ctx context.Context, req *mcp.CallToolRequest, input ap "resources": results, }) } - diff --git a/internal/mcp/tools_audit.go 
b/internal/mcp/tools_audit.go index 7c60d8ab5..7211d1efd 100644 --- a/internal/mcp/tools_audit.go +++ b/internal/mcp/tools_audit.go @@ -16,14 +16,14 @@ type auditInput struct { Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` Category string `json:"category,omitempty" jsonschema:"filter by category: Security, Reliability, or Efficiency"` Severity string `json:"severity,omitempty" jsonschema:"filter by severity: danger or warning"` - Limit int `json:"limit,omitempty" jsonschema:"max findings to return (default 30, max 100)"` + Limit int `json:"limit,omitempty" jsonschema:"max audit violation findings to return (default 30, max 100). This limits findings only; compliant resources are not returned."` } type auditToolResult struct { - Summary auditSummary `json:"summary"` - Findings []auditFinding `json:"findings"` - TotalCount int `json:"totalCount"` - Truncated bool `json:"truncated,omitempty"` + Summary auditSummary `json:"summary"` + Findings []auditFinding `json:"findings"` + TotalCount int `json:"totalCount"` + Truncated bool `json:"truncated,omitempty"` } type auditSummary struct { @@ -34,10 +34,10 @@ type auditSummary struct { } type auditFinding struct { - Resource string `json:"resource"` // "Deployment/default/web" - Check string `json:"check"` // "runAsRoot" - Severity string `json:"severity"` // "danger" or "warning" - Category string `json:"category"` // "Security" + Resource string `json:"resource"` // "Deployment/default/web" + Check string `json:"check"` // "runAsRoot" + Severity string `json:"severity"` // "danger" or "warning" + Category string `json:"category"` // "Security" Message string `json:"message"` Remediation string `json:"remediation,omitempty"` } @@ -154,4 +154,3 @@ func loadAuditConfig() settings.AuditConfig { } return settings.DefaultAuditConfig() } - diff --git a/internal/mcp/tools_filter_test.go b/internal/mcp/tools_filter_test.go index 20c5c7227..d32973de2 100644 --- 
a/internal/mcp/tools_filter_test.go +++ b/internal/mcp/tools_filter_test.go @@ -636,6 +636,22 @@ func TestHandleSearch_Secrets_PerNamespaceFanout(t *testing.T) { } } +func TestHandleSearch_QueryAlias(t *testing.T) { + // Agents naturally call search with {"query": "..."}; keep accepting + // q, but make query work as the primary ergonomic field. + setupFakeCacheForFilterTests(t) + ctx := context.Background() + + result, _, err := handleSearch(ctx, nil, searchInput{Query: "alpha-pod"}) + if err != nil { + t.Fatalf("handleSearch: %v", err) + } + body := extractText(t, result) + if !containsName(body, "alpha-pod") { + t.Errorf("expected alpha-pod in search hits: %s", body) + } +} + func TestHandleSearch_Secrets_ClusterWideShape_NsFilter(t *testing.T) { // Regression for the bug where AllowedNamespaces==nil (cluster-wide // namespace sentinel) plus a concrete `ns:` filter took the cluster- @@ -666,3 +682,59 @@ func TestHandleSearch_Secrets_ClusterWideShape_NsFilter(t *testing.T) { t.Errorf("beta-secret leaked despite ns:alpha filter + per-ns RBAC: %s", body) } } + +func TestNormalizeWorkloadLogsKind_DefaultsToDeployment(t *testing.T) { + if got := normalizeWorkloadLogsKind(""); got != "deployments" { + t.Fatalf("blank workload-log kind = %q, want deployments", got) + } + if got := normalizeWorkloadLogsKind("statefulset"); got != "statefulsets" { + t.Fatalf("statefulset workload-log kind = %q, want statefulsets", got) + } +} + +func TestParseLogsSince(t *testing.T) { + tests := []struct { + name string + in string + wantSecs int64 + wantNil bool + wantError bool + }{ + {name: "empty returns nil", in: "", wantNil: true}, + {name: "whitespace returns nil", in: " ", wantNil: true}, + {name: "30s", in: "30s", wantSecs: 30}, + {name: "10m", in: "10m", wantSecs: 600}, + {name: "1h", in: "1h", wantSecs: 3600}, + {name: "sub-second floors to 1s", in: "500ms", wantSecs: 1}, + {name: "invalid format", in: "10minutes", wantError: true}, + {name: "negative duration", in: "-5m", 
wantError: true}, + {name: "zero duration", in: "0s", wantError: true}, + {name: "junk", in: "soon", wantError: true}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := parseLogsSince(tc.in) + if tc.wantError { + if err == nil { + t.Fatalf("expected error, got nil (result=%v)", got) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if tc.wantNil { + if got != nil { + t.Fatalf("expected nil, got %d", *got) + } + return + } + if got == nil { + t.Fatalf("expected %d, got nil", tc.wantSecs) + } + if *got != tc.wantSecs { + t.Fatalf("got %d, want %d", *got, tc.wantSecs) + } + }) + } +} diff --git a/internal/mcp/tools_helm.go b/internal/mcp/tools_helm.go index e4c805ff7..8600f31d3 100644 --- a/internal/mcp/tools_helm.go +++ b/internal/mcp/tools_helm.go @@ -33,8 +33,8 @@ type getHelmReleaseInput struct { Namespace string `json:"namespace" jsonschema:"release namespace"` Name string `json:"name" jsonschema:"release name"` Include string `json:"include,omitempty" jsonschema:"comma-separated extras to include: values, history, diff. 
Example: values,history"` - DiffRev1 int `json:"diff_revision_1,omitempty" jsonschema:"first revision for diff (requires include=diff)"` - DiffRev2 int `json:"diff_revision_2,omitempty" jsonschema:"second revision for diff (requires include=diff), defaults to current"` + DiffRev1 int `json:"diff_revision_1,omitempty" jsonschema:"first revision for diff; only used when include contains diff"` + DiffRev2 int `json:"diff_revision_2,omitempty" jsonschema:"second revision for diff; only used when include contains diff, defaults to current"` } // Helm tool handlers diff --git a/internal/mcp/tools_neighborhood.go b/internal/mcp/tools_neighborhood.go index a913267a5..b3a73e419 100644 --- a/internal/mcp/tools_neighborhood.go +++ b/internal/mcp/tools_neighborhood.go @@ -18,7 +18,7 @@ type getNeighborhoodInput struct { Group string `json:"group,omitempty" jsonschema:"API group required to disambiguate kinds that collide across groups. Examples: serving.knative.dev for KNative Service (vs core/v1 Service), cluster.x-k8s.io for CAPI Cluster (vs CNPG Cluster), networking.istio.io for Istio Gateway (vs gateway.networking.k8s.io Gateway). Omit for kinds with no known collisions."` Namespace string `json:"namespace,omitempty" jsonschema:"resource namespace; omit for cluster-scoped kinds"` Name string `json:"name" jsonschema:"resource name"` - Profile string `json:"profile,omitempty" jsonschema:"neighborhood breadth: auto or all. Default: auto (picks a bounded edge set from the root kind)."` + Profile string `json:"profile,omitempty" jsonschema:"neighborhood breadth: auto or all. Default: auto (picks a bounded edge set from the root kind). all expands every edge type and is heavier; use only when auto produced a too-narrow neighborhood."` Hops int `json:"hops,omitempty" jsonschema:"BFS depth. Default 1, max 2."` MaxNodes int `json:"max_nodes,omitempty" jsonschema:"node-budget cap. Default 25. 
When the cap is hit mid-expansion, truncated=true is set and the partial subgraph is returned."` } diff --git a/internal/mcp/tools_rbac.go b/internal/mcp/tools_rbac.go index 170727c08..7ab198976 100644 --- a/internal/mcp/tools_rbac.go +++ b/internal/mcp/tools_rbac.go @@ -35,12 +35,12 @@ type subjectPermissionsInput struct { } type subjectPermissionsResult struct { - Subject mcpSubject `json:"subject"` - Bindings []mcpBindingLite `json:"bindings"` + Subject mcpSubject `json:"subject"` + Bindings []mcpBindingLite `json:"bindings"` FlatRules []rbacv1.PolicyRule `json:"flatRules"` - Truncated bool `json:"truncated,omitempty"` - UsedByPods []string `json:"usedByPods,omitempty"` // "ns/name" pairs - PodsTotal int `json:"podsTotal,omitempty"` // >0 when usedByPods was truncated + Truncated bool `json:"truncated,omitempty"` + UsedByPods []string `json:"usedByPods,omitempty"` // "ns/name" pairs + PodsTotal int `json:"podsTotal,omitempty"` // >0 when usedByPods was truncated } type mcpSubject struct { @@ -53,7 +53,7 @@ type mcpSubject struct { // identify the binding and the role it grants; rule details are accessible // via get_resource on the role. 
type mcpBindingLite struct { - BindingKind string `json:"bindingKind"` // "RoleBinding" | "ClusterRoleBinding" + BindingKind string `json:"bindingKind"` // "RoleBinding" | "ClusterRoleBinding" BindingNamespace string `json:"bindingNamespace,omitempty"` BindingName string `json:"bindingName"` RoleKind string `json:"roleKind"` // "Role" | "ClusterRole" diff --git a/internal/mcp/tools_workloads.go b/internal/mcp/tools_workloads.go index ef990bba0..02e8a9379 100644 --- a/internal/mcp/tools_workloads.go +++ b/internal/mcp/tools_workloads.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "log" + "regexp" "sort" "strings" "sync" @@ -13,8 +14,8 @@ import ( "github.com/modelcontextprotocol/go-sdk/mcp" corev1 "k8s.io/api/core/v1" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" "github.com/skyhook-io/radar/internal/k8s" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" ) // Workload tool input types @@ -35,11 +36,37 @@ type manageCronJobInput struct { } type getWorkloadLogsInput struct { - Kind string `json:"kind" jsonschema:"workload kind: deployment, statefulset, or daemonset"` + Kind string `json:"kind,omitempty" jsonschema:"workload kind: deployment, statefulset, or daemonset. Defaults to deployment when omitted."` Namespace string `json:"namespace" jsonschema:"workload namespace"` Name string `json:"name" jsonschema:"workload name"` Container string `json:"container,omitempty" jsonschema:"specific container name, defaults to all containers"` TailLines int `json:"tail_lines,omitempty" jsonschema:"lines per pod (default 100)"` + Grep string `json:"grep,omitempty" jsonschema:"optional regular expression to keep matching log lines before diagnostic filtering, like kubectl logs | grep PATTERN"` + Since string `json:"since,omitempty" jsonschema:"only return logs newer than this duration (e.g. 30s, 10m, 1h), like kubectl logs --since"` + Previous bool `json:"previous,omitempty" jsonschema:"return logs from the previous terminated container instance (e.g. 
for CrashLoopBackOff diagnosis), like kubectl logs -p"` +} + +// parseLogsSince converts a relative duration string like "30s"/"10m"/"1h" +// into seconds for corev1.PodLogOptions.SinceSeconds. Empty input returns +// (nil, nil) so the caller can leave SinceSeconds unset. Negative or zero +// durations are rejected — kubectl's behavior on these is implementation- +// dependent and not useful for diagnosis. +func parseLogsSince(s string) (*int64, error) { + if strings.TrimSpace(s) == "" { + return nil, nil + } + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid since duration %q: %w (expected e.g. 30s, 10m, 1h)", s, err) + } + if d <= 0 { + return nil, fmt.Errorf("invalid since duration %q: must be positive", s) + } + secs := int64(d.Seconds()) + if secs < 1 { + secs = 1 + } + return &secs, nil } // Workload tool handlers @@ -141,7 +168,7 @@ func handleManageCronJob(ctx context.Context, req *mcp.CallToolRequest, input ma } func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input getWorkloadLogsInput) (*mcp.CallToolResult, any, error) { - kind := normalizeWorkloadKind(input.Kind) + kind := normalizeWorkloadLogsKind(input.Kind) if kind == "" { return nil, nil, fmt.Errorf("invalid kind %q: must be deployment, statefulset, or daemonset", input.Kind) } @@ -180,6 +207,15 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input if input.TailLines > 0 { tailLines = int64(input.TailLines) } + if strings.TrimSpace(input.Grep) != "" { + if _, err := regexp.Compile(input.Grep); err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } + } + sinceSeconds, err := parseLogsSince(input.Since) + if err != nil { + return nil, nil, err + } // Validate container name if specified if input.Container != "" { @@ -220,9 +256,11 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input defer wg.Done() opts := &corev1.PodLogOptions{ - Container: containerName, - 
TailLines: &tailLines, - Timestamps: true, + Container: containerName, + TailLines: &tailLines, + SinceSeconds: sinceSeconds, + Previous: input.Previous, + Timestamps: true, } entry := logEntry{ @@ -251,8 +289,8 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input return } - // Apply AI-optimized log filtering - entry.Logs = aicontext.FilterLogs(string(data)) + filtered, _ := aicontext.FilterLogsByPattern(string(data), input.Grep) + entry.Logs = filtered mu.Lock() allLogs = append(allLogs, entry) @@ -368,3 +406,10 @@ func normalizeWorkloadKind(kind string) string { return "" } } + +func normalizeWorkloadLogsKind(kind string) string { + if strings.TrimSpace(kind) == "" { + return "deployments" + } + return normalizeWorkloadKind(kind) +} diff --git a/internal/search/candidate.go b/internal/search/candidate.go index 2a2393ae5..c000807eb 100644 --- a/internal/search/candidate.go +++ b/internal/search/candidate.go @@ -1,6 +1,10 @@ package search import ( + "fmt" + "sort" + "strconv" + appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" @@ -25,6 +29,7 @@ func fromObject(obj runtime.Object, kind string) (candidate, bool) { Annotations: m.GetAnnotations(), } c.Images = imagesForTyped(obj) + c.Content = contentForTyped(obj, kind) return c, true } @@ -41,9 +46,81 @@ func fromUnstructured(u *unstructured.Unstructured, kind, group string) candidat Annotations: u.GetAnnotations(), } c.Images = imagesFromUnstructured(u) + c.Content = contentFromUnstructured(u, kind) return c } +func contentForTyped(obj runtime.Object, kind string) []ContentField { + if obj == nil { + return nil + } + // Secrets are intentionally not content-indexed. Search may expose Secret + // names to callers with Secret RBAC, but matching/snippeting data values + // would turn search into a secret-value disclosure path. 
+ if kind == "Secret" { + return nil + } + m, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil + } + return contentFromMap(m, kind) +} + +func contentFromUnstructured(u *unstructured.Unstructured, kind string) []ContentField { + if u == nil || u.Object == nil || kind == "Secret" { + return nil + } + return contentFromMap(u.Object, kind) +} + +func contentFromMap(obj map[string]any, kind string) []ContentField { + var out []ContentField + if kind == "ConfigMap" { + walkContent("data", obj["data"], &out) + walkContent("binaryData", obj["binaryData"], &out) + return out + } + // These roots capture the useful grep-like surface without indexing noisy + // metadata such as managedFields or leaking Secret data values. + walkContent("spec", obj["spec"], &out) + walkContent("status", obj["status"], &out) + walkContent("data", obj["data"], &out) + return out +} + +func walkContent(path string, v any, out *[]ContentField) { + switch x := v.(type) { + case nil: + return + case string: + if x != "" { + *out = append(*out, ContentField{Path: path, Value: x}) + } + case bool: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatBool(x)}) + case int: + *out = append(*out, ContentField{Path: path, Value: strconv.Itoa(x)}) + case int64: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatInt(x, 10)}) + case float64: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatFloat(x, 'f', -1, 64)}) + case map[string]any: + keys := make([]string, 0, len(x)) + for k := range x { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + walkContent(path+"."+k, x[k], out) + } + case []any: + for i, item := range x { + walkContent(fmt.Sprintf("%s[%d]", path, i), item, out) + } + } +} + func imagesForTyped(obj runtime.Object) []string { switch o := obj.(type) { case *corev1.Pod: diff --git a/internal/search/score.go b/internal/search/score.go index 16a407cc1..c877237e1 100644 --- 
a/internal/search/score.go +++ b/internal/search/score.go @@ -13,8 +13,11 @@ const ( scoreLabelValSubstr = 18 scoreAnnoSubstr = 15 scoreImageSubstr = 20 + scoreContentExact = 16 + scoreContentSubstr = 12 scoreKindExact = 10 scoreKindSubstr = 5 + maxSnippetRunes = 180 ) // candidate carries the searchable face of a K8s object: identity, @@ -28,63 +31,84 @@ type candidate struct { Labels map[string]string Annotations map[string]string Images []string + Content []ContentField +} + +// ContentField is a searchable string extracted from object content, such as +// ConfigMap data, workload env refs, CRD spec fields, or status messages. +type ContentField struct { + Path string + Value string } // match runs the parsed query against a candidate and returns the score // plus which sites matched. Returns (0, nil, false) when filters reject // the candidate or when at least one free token didn't land anywhere. -func match(q Query, c candidate) (int, []MatchedField, bool) { +func match(q Query, c candidate) (int, []MatchedField, []MatchSnippet, bool) { // Hard filters first — cheaper to reject early. if len(q.KindFilter) > 0 && !kindMatches(c.Kind, q.KindFilter) { - return 0, nil, false + return 0, nil, nil, false } if len(q.NSFilter) > 0 && !sliceContainsFold(q.NSFilter, c.Namespace) { - return 0, nil, false + return 0, nil, nil, false } for _, lf := range q.LabelFilter { v, ok := c.Labels[lf.Key] if !ok { - return 0, nil, false + return 0, nil, nil, false } if lf.Value != "" && v != lf.Value { - return 0, nil, false + return 0, nil, nil, false } } for _, img := range q.ImageFilter { if !anyContainsFold(c.Images, img) { - return 0, nil, false + return 0, nil, nil, false } } if len(q.Tokens) == 0 { // Pure-filter query: no scoring signal, but the candidate passed // every filter, so return a flat score so it shows up. 
- return 1, nil, true + return 1, nil, nil, true } total := 0 var matched []MatchedField + var snippets []MatchSnippet for _, tok := range q.Tokens { - best, site, ok := scoreToken(tok, c) + best, site, snippet, ok := scoreToken(tok, c) if !ok { - return 0, nil, false + return 0, nil, nil, false } total += best matched = append(matched, MatchedField{Token: tok, Site: site, Score: best}) + if snippet != nil { + snippets = append(snippets, *snippet) + } } - return total, matched, true + return total, matched, snippets, true } // scoreToken returns the highest-scoring site a single free token matches, // or (0, "", false) if the token doesn't land on any searchable field. -func scoreToken(tok string, c candidate) (int, string, bool) { +func scoreToken(tok string, c candidate) (int, string, *MatchSnippet, bool) { low := strings.ToLower(tok) best := 0 bestSite := "" + var bestSnippet *MatchSnippet consider := func(score int, site string) { if score > best { best = score bestSite = site + bestSnippet = nil + } + } + considerSnippet := func(score int, site string, snip MatchSnippet) { + if score > best { + best = score + bestSite = site + bestSnippet = &snip } } @@ -127,6 +151,26 @@ func scoreToken(tok string, c candidate) (int, string, bool) { consider(scoreImageSubstr, "image") } } + for _, cf := range c.Content { + if cf.Value == "" { + continue + } + vLow := strings.ToLower(cf.Value) + switch { + case vLow == low: + considerSnippet(scoreContentExact, "content:"+cf.Path, MatchSnippet{ + Token: tok, + Path: cf.Path, + Snippet: snippetForToken(cf.Value, tok), + }) + case strings.Contains(vLow, low): + considerSnippet(scoreContentSubstr, "content:"+cf.Path, MatchSnippet{ + Token: tok, + Path: cf.Path, + Snippet: snippetForToken(cf.Value, tok), + }) + } + } if c.Kind != "" { kindLow := strings.ToLower(c.Kind) switch { @@ -138,9 +182,44 @@ func scoreToken(tok string, c candidate) (int, string, bool) { } if best == 0 { - return 0, "", false + return 0, "", nil, false + } + 
return best, bestSite, bestSnippet, true +} + +func snippetForToken(value, tok string) string { + runes := []rune(value) + if len(runes) <= maxSnippetRunes { + return value + } + valueLow := strings.ToLower(value) + tokLow := strings.ToLower(tok) + byteIdx := strings.Index(valueLow, tokLow) + if byteIdx < 0 { + return string(runes[:maxSnippetRunes]) + } + prefixRunes := len([]rune(value[:byteIdx])) + half := maxSnippetRunes / 2 + start := prefixRunes - half + if start < 0 { + start = 0 + } + end := start + maxSnippetRunes + if end > len(runes) { + end = len(runes) + start = end - maxSnippetRunes + if start < 0 { + start = 0 + } + } + snippet := string(runes[start:end]) + if start > 0 { + snippet = "..." + snippet + } + if end < len(runes) { + snippet += "..." } - return best, bestSite, true + return snippet } // kindMatches returns true if any of the kind filters refer to the candidate kind. diff --git a/internal/search/score_test.go b/internal/search/score_test.go index 000f1a3be..16813b523 100644 --- a/internal/search/score_test.go +++ b/internal/search/score_test.go @@ -1,6 +1,9 @@ package search -import "testing" +import ( + "strings" + "testing" +) func cand() candidate { return candidate{ @@ -15,7 +18,7 @@ func cand() candidate { func TestMatch_FreeTokenScoresHighestSite(t *testing.T) { q := Parse("redis") - score, _, ok := match(q, cand()) + score, _, _, ok := match(q, cand()) if !ok { t.Fatal("expected match") } @@ -27,7 +30,7 @@ func TestMatch_FreeTokenScoresHighestSite(t *testing.T) { func TestMatch_TwoTokensSummed(t *testing.T) { q := Parse("redis cache") - score, matched, ok := match(q, cand()) + score, matched, _, ok := match(q, cand()) if !ok { t.Fatal("expected match") } @@ -42,57 +45,57 @@ func TestMatch_TwoTokensSummed(t *testing.T) { func TestMatch_TokenMustMatchSomewhere(t *testing.T) { q := Parse("redis nope-not-here") - if _, _, ok := match(q, cand()); ok { + if _, _, _, ok := match(q, cand()); ok { t.Fatal("expected no match — second token 
must reject") } } func TestMatch_KindFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("kind:Service"), c); ok { + if _, _, _, ok := match(Parse("kind:Service"), c); ok { t.Fatal("kind:Service should reject a Pod candidate") } - if _, _, ok := match(Parse("kind:Pod"), c); !ok { + if _, _, _, ok := match(Parse("kind:Pod"), c); !ok { t.Fatal("kind:Pod should match a Pod candidate") } // Pluralized form too — radar fetch.go uses lowercase plural keys. - if _, _, ok := match(Parse("kind:pods"), c); !ok { + if _, _, _, ok := match(Parse("kind:pods"), c); !ok { t.Fatal("kind:pods should match") } } func TestMatch_NSFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("ns:dev"), c); ok { + if _, _, _, ok := match(Parse("ns:dev"), c); ok { t.Fatal("ns:dev should reject prod candidate") } - if _, _, ok := match(Parse("ns:prod"), c); !ok { + if _, _, _, ok := match(Parse("ns:prod"), c); !ok { t.Fatal("ns:prod should match") } } func TestMatch_LabelFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("label:app=redis"), c); !ok { + if _, _, _, ok := match(Parse("label:app=redis"), c); !ok { t.Fatal("label:app=redis should match") } - if _, _, ok := match(Parse("label:app=postgres"), c); ok { + if _, _, _, ok := match(Parse("label:app=postgres"), c); ok { t.Fatal("label:app=postgres should reject") } - if _, _, ok := match(Parse("label:app"), c); !ok { + if _, _, _, ok := match(Parse("label:app"), c); !ok { t.Fatal("label:app (key-only) should match when label exists") } - if _, _, ok := match(Parse("label:missing"), c); ok { + if _, _, _, ok := match(Parse("label:missing"), c); ok { t.Fatal("label:missing should reject when label absent") } } func TestMatch_ImageFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("image:redis"), c); !ok { + if _, _, _, ok := match(Parse("image:redis"), c); !ok { t.Fatal("image:redis should match") } - if _, _, ok := match(Parse("image:nginx"), c); ok { + if _, _, _, ok := 
match(Parse("image:nginx"), c); ok { t.Fatal("image:nginx should reject") } } @@ -100,7 +103,7 @@ func TestMatch_ImageFilter(t *testing.T) { func TestMatch_PureFilterReturnsFlatScore(t *testing.T) { // Filter-only query (no free tokens) should return a positive flat // score so candidates show up at all. - score, _, ok := match(Parse("kind:Pod ns:prod"), cand()) + score, _, _, ok := match(Parse("kind:Pod ns:prod"), cand()) if !ok || score <= 0 { t.Fatalf("filter-only match: score=%d ok=%v", score, ok) } @@ -108,11 +111,32 @@ func TestMatch_PureFilterReturnsFlatScore(t *testing.T) { func TestMatch_CaseInsensitive(t *testing.T) { q := Parse("REDIS") - if _, _, ok := match(q, cand()); !ok { + if _, _, _, ok := match(q, cand()); !ok { t.Fatal("expected case-insensitive match") } } +func TestMatch_ContentSnippet(t *testing.T) { + c := cand() + c.Content = []ContentField{{ + Path: "data.flags.json", + Value: `{"adServiceFailure":{"defaultVariant":"on"}}`, + }} + score, matched, snippets, ok := match(Parse("adServiceFailure"), c) + if !ok { + t.Fatal("expected content match") + } + if score != scoreContentSubstr { + t.Fatalf("score=%d, expected content score %d", score, scoreContentSubstr) + } + if len(matched) != 1 || matched[0].Site != "content:data.flags.json" { + t.Fatalf("matched=%+v", matched) + } + if len(snippets) != 1 || snippets[0].Path != "data.flags.json" || !strings.Contains(snippets[0].Snippet, "adServiceFailure") { + t.Fatalf("snippets=%+v", snippets) + } +} + func TestKindMatches_Variants(t *testing.T) { cases := []struct { kind, filter string diff --git a/internal/search/search.go b/internal/search/search.go index df7b5da01..ccb536e0c 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -213,7 +213,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } c.Group = tk.Group - score, matched, ok := match(q, c) + score, matched, snippets, ok := match(q, c) if !ok { continue } @@ -242,7 +242,7 @@ 
func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } pending = append(pending, pendingHit{ - hit: buildHit(score, matched, c, opts.Include, obj, nil), + hit: buildHit(score, matched, snippets, c, opts.Include, obj, nil), obj: obj, c: c, }) @@ -285,7 +285,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err res.Searched += len(items) for _, u := range items { c := fromUnstructured(u, kind, gvr.Group) - score, matched, ok := match(q, c) + score, matched, snippets, ok := match(q, c) if !ok { continue } @@ -312,7 +312,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } pending = append(pending, pendingHit{ - hit: buildHit(score, matched, c, opts.Include, nil, u), + hit: buildHit(score, matched, snippets, c, opts.Include, nil, u), u: u, c: c, }) @@ -407,7 +407,7 @@ func isClusterScopedKind(kind string) bool { // done here — it happens in Search's post-truncation loop so the // expensive topology lookups + issue-index reads only run for the hits // that survive sort + Limit truncation. 
-func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { +func buildHit(score int, matched []MatchedField, snippets []MatchSnippet, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { h := Hit{ Score: score, Kind: c.Kind, @@ -415,6 +415,7 @@ func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, Namespace: c.Namespace, Name: c.Name, Matched: matched, + Snippets: snippets, } switch mode { case IncludeSummary: diff --git a/internal/search/search_test.go b/internal/search/search_test.go index 91f8ed0dc..4b97e51cf 100644 --- a/internal/search/search_test.go +++ b/internal/search/search_test.go @@ -118,6 +118,69 @@ func TestSearch_ImageMatch(t *testing.T) { } } +func TestSearch_ConfigMapDataMatchWithSnippet(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "configmaps": { + &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: "flagd-config", Namespace: "astronomy-shop"}, + Data: map[string]string{ + "flags.json": `{"adServiceFailure":{"defaultVariant":"on"}}`, + }, + }, + }, + }, + } + res, err := Search(context.Background(), p, Parse("adServiceFailure"), Options{Include: IncludeNone}) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected configmap content hit, got %+v", res.Hits) + } + h := res.Hits[0] + if h.Kind != "ConfigMap" || h.Name != "flagd-config" { + t.Fatalf("wrong hit: %+v", h) + } + if len(h.Snippets) != 1 || h.Snippets[0].Path != "data.flags.json" { + t.Fatalf("expected data snippet, got %+v", h.Snippets) + } +} + +func TestSearch_DynamicSpecMatchWithSnippet(t *testing.T) { + gvr := schema.GroupVersionResource{Group: "chaos-mesh.org", Version: "v1alpha1", Resource: "networkchaos"} + u := &unstructured.Unstructured{Object: map[string]any{ + "apiVersion": "chaos-mesh.org/v1alpha1", + "kind": "NetworkChaos", + "metadata": map[string]any{ + "name": "net-fault", 
+ "namespace": "hotel", + }, + "spec": map[string]any{ + "selector": map[string]any{ + "labelSelectors": map[string]any{ + "app": "user", + }, + }, + "action": "delay", + }, + }} + p := &fakeProvider{ + dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{gvr: {u}}, + kinds: map[schema.GroupVersionResource]string{gvr: "NetworkChaos"}, + } + res, err := Search(context.Background(), p, Parse("delay"), Options{Include: IncludeNone}) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected CRD content hit, got %+v", res.Hits) + } + if len(res.Hits[0].Snippets) != 1 || res.Hits[0].Snippets[0].Path != "spec.action" { + t.Fatalf("expected spec snippet, got %+v", res.Hits[0].Snippets) + } +} + func TestSearch_LimitTruncates(t *testing.T) { pods := make([]runtime.Object, 0, 100) for i := 0; i < 100; i++ { diff --git a/internal/search/types.go b/internal/search/types.go index b33215853..b1d650fc5 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -49,6 +49,7 @@ type Hit struct { Summary any `json:"summary,omitempty"` Raw any `json:"raw,omitempty"` Matched []MatchedField `json:"matched,omitempty"` + Snippets []MatchSnippet `json:"snippets,omitempty"` // SummaryContext is the compact per-row enrichment (managedBy, health, // issueCount). Populated by handlers via Options.SummaryBuilder; nil // when the caller opted out (context=none) or no fields apply. @@ -58,10 +59,19 @@ type Hit struct { // MatchedField records where a query token landed (debug + UI highlight). type MatchedField struct { Token string `json:"token"` - Site string `json:"site"` // "name" | "namespace" | "label:k" | "annotation:k" | "image" | "kind" + Site string `json:"site"` // "name" | "namespace" | "label:k" | "annotation:k" | "image" | "kind" | "content:path" Score int `json:"score"` } +// MatchSnippet is a short excerpt from a content field that matched a free +// token. 
It lets agents use search as a cheap grep-like first pass without +// fetching the full resource body for every hit. +type MatchSnippet struct { + Token string `json:"token"` + Path string `json:"path"` + Snippet string `json:"snippet"` +} + // Result is the full response shape for a search request. type Result struct { Hits []Hit `json:"hits"` diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 40db87bc9..68511d576 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -17,10 +17,11 @@ // two protocols. // // Revisit this opt-out when: -// (a) the agent surface stabilizes (no major shape changes for two -// release cycles), AND -// (b) Skyhook commits to a public customer-facing AI SDK that needs -// generated bindings. +// +// (a) the agent surface stabilizes (no major shape changes for two +// release cycles), AND +// (b) Skyhook commits to a public customer-facing AI SDK that needs +// generated bindings. // // Until both conditions are met, bringing /api/ai/* under openapi.yaml // is premature — it would pay the spec-authoring tax during evolution @@ -35,7 +36,9 @@ import ( "strings" "github.com/go-chi/chi/v5" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/audit" @@ -80,6 +83,17 @@ func (a policyReportLookupAdapter) FindingsFor(group, kind, namespace, name stri return out } +type serviceBackendLookup struct { + cache *k8s.ResourceCache +} + +func (l serviceBackendLookup) PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) { + if l.cache == nil || l.cache.Pods() == nil { + return nil, nil + } + return l.cache.Pods().Pods(namespace).List(selector) +} + // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. 
func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aicontext.VerbosityLevel { switch r.URL.Query().Get("verbosity") { @@ -410,10 +424,11 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin auditSum := computeAuditSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) opts := resourcecontext.Options{ - Tier: resourcecontext.TierBasic, - AccessChecker: s.newRequestScopedChecker(r), - IssueSummary: issueSum, - AuditSummary: auditSum, + Tier: resourcecontext.TierBasic, + AccessChecker: s.newRequestScopedChecker(r), + IssueSummary: issueSum, + AuditSummary: auditSum, + ServiceBackends: serviceBackendLookup{cache: cache}, } // Wire the PolicyReport index when Kyverno is installed. Build emits a diff --git a/pkg/ai/context/logs.go b/pkg/ai/context/logs.go index 50e09e82d..cd0b7192c 100644 --- a/pkg/ai/context/logs.go +++ b/pkg/ai/context/logs.go @@ -74,6 +74,26 @@ func FilterLogs(rawLogs string) FilteredLogs { } } +// FilterLogsByPattern first keeps only lines matching pattern, then applies +// the usual diagnostic log filtering. This gives agents a server-side +// equivalent of `kubectl logs ... | grep PATTERN | tail`. +func FilterLogsByPattern(rawLogs, pattern string) (FilteredLogs, error) { + if strings.TrimSpace(pattern) == "" { + return FilterLogs(rawLogs), nil + } + re, err := regexp.Compile(pattern) + if err != nil { + return FilteredLogs{}, err + } + var matched []string + for _, line := range strings.Split(strings.TrimRight(rawLogs, "\n"), "\n") { + if re.MatchString(line) { + matched = append(matched, line) + } + } + return FilterLogs(strings.Join(matched, "\n")), nil +} + // deduplicateStackTraces collapses identical consecutive lines with a repeat count. 
func deduplicateStackTraces(lines []string) []string { if len(lines) == 0 { diff --git a/pkg/ai/context/logs_test.go b/pkg/ai/context/logs_test.go index c129efd28..f2a366a53 100644 --- a/pkg/ai/context/logs_test.go +++ b/pkg/ai/context/logs_test.go @@ -180,3 +180,30 @@ func TestFilterLogs_RedactsSecrets(t *testing.T) { t.Errorf("Secret not redacted in log line: %s", result.Lines[0]) } } + +func TestFilterLogsByPattern_FiltersBeforeSummary(t *testing.T) { + lines := []string{ + "INFO checkout request ok", + "INFO cart request slow", + "INFO recommendation request ok", + } + input := strings.Join(lines, "\n") + + result, err := FilterLogsByPattern(input, "cart") + if err != nil { + t.Fatalf("FilterLogsByPattern returned error: %v", err) + } + if result.TotalLines != 1 { + t.Errorf("Expected TotalLines=1 after grep, got %d", result.TotalLines) + } + if len(result.Lines) != 1 || !strings.Contains(result.Lines[0], "cart request slow") { + t.Fatalf("Expected cart line, got %#v", result.Lines) + } +} + +func TestFilterLogsByPattern_InvalidRegex(t *testing.T) { + _, err := FilterLogsByPattern("INFO ok", "[") + if err == nil { + t.Fatal("Expected invalid regex error") + } +} diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 49170abf9..856f0dbe4 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -3,6 +3,7 @@ package resourcecontext import ( "context" "sort" + "strings" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -14,6 +15,7 @@ import ( storagev1 "k8s.io/api/storage/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/pkg/topology" @@ -58,6 +60,12 @@ type Options struct { IssueSummary *IssueSummary AuditSummary *AuditSummary PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings + + // Optional kind-specific lookups. 
ServiceBackends is used only for + // Service resources to attach realized pod-selection state. The raw + // Service spec already carries selector/ports; this lookup answers + // whether that selector currently resolves to ready Pods. + ServiceBackends ServiceBackendLookup } // PolicyReportLookup is the minimal interface Build needs from the @@ -69,6 +77,10 @@ type PolicyReportLookup interface { FindingsFor(group, kind, namespace, name string) []KyvernoFinding } +type ServiceBackendLookup interface { + PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) +} + // RefAccessChecker abstracts the RBAC check so this package doesn't import // any internal/* package. REST and MCP handlers each implement this with a // request-scoped batch cache (see internal/server/rc_rbac.go). @@ -102,8 +114,9 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // topology.Relationships the canonical projection: server-side // SynthesizeManagedBy walks the owner chain + GitOps signals, and the Pod // hygiene fields (.ServiceAccount, .Node) are populated from pod.Spec. - // We do NOT re-walk owner refs here — that would duplicate the topology - // package's logic and risk drift. + // ManagedBy stays delegated to topology to avoid duplicating its owner-chain + // and GitOps logic. The direct Owner field below may still fall back to the + // object's controller OwnerReference when topology is absent or cold. 
// // Single-resource callers (REST GET, MCP get_resource) leave // opts.Relationships nil and let us compute via GetRelationshipsWithObject @@ -144,6 +157,26 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte "managedBy", omitted) } + if rel != nil && rel.Owner != nil { + candidate := &ContextRef{ + Kind: rel.Owner.Kind, + Group: rel.Owner.Group, + Namespace: rel.Owner.Namespace, + Name: rel.Owner.Name, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.Owner = candidate + } else { + omitted.add("owner", OmittedRBACDenied) + } + } else if owner := ownerFromObject(obj, ident.Namespace); owner != nil { + if checkRef(ctx, opts.AccessChecker, owner) { + rc.Owner = owner + } else { + omitted.add("owner", OmittedRBACDenied) + } + } + // 2. Topology-derived: Exposes, SelectedBy, ScaledBy if rel != nil { exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) @@ -176,6 +209,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // Build needs. 
if pod, ok := obj.(*corev1.Pod); ok { rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) + rc.PodSummary = buildPodSummary(pod) // Prefer rel.ServiceAccount over re-reading pod.Spec — same source, // but consolidating through Relationships keeps Build aligned with @@ -222,6 +256,20 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } } + if svc, ok := obj.(*corev1.Service); ok { + rc.ServiceSummary = buildServiceSummary(ctx, svc, opts.ServiceBackends, opts.AccessChecker, omitted) + } + if uses := buildUsesFromWorkload(ctx, obj, opts.AccessChecker, omitted); uses != nil { + rc.Uses = uses + } + rc.WorkloadSummary = buildWorkloadSummary(obj) + rc.IngressSummary = buildIngressSummary(ctx, obj, opts.AccessChecker, omitted) + rc.NodeSummary = buildNodeSummary(obj) + rc.PVCSummary = buildPVCSummary(obj) + rc.JobSummary = buildJobSummary(obj) + rc.CronJobSummary = buildCronJobSummary(ctx, obj, opts.AccessChecker, omitted) + rc.StatusSummary = buildStatusSummary(obj) + // 4. Pre-computed summaries — pass-through. 
rc.IssueSummary = opts.IssueSummary rc.AuditSummary = opts.AuditSummary @@ -335,6 +383,37 @@ func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { } } +func ownerFromObject(obj runtime.Object, namespace string) *ContextRef { + m, ok := obj.(metav1.Object) + if !ok { + return nil + } + owners := m.GetOwnerReferences() + if len(owners) == 0 { + return nil + } + chosen := owners[0] + for _, owner := range owners { + if owner.Controller != nil && *owner.Controller { + chosen = owner + break + } + } + return &ContextRef{ + Kind: chosen.Kind, + Group: groupFromAPIVersion(chosen.APIVersion), + Namespace: namespace, + Name: chosen.Name, + } +} + +func groupFromAPIVersion(apiVersion string) string { + if apiVersion == "" || !strings.Contains(apiVersion, "/") { + return "" + } + return strings.SplitN(apiVersion, "/", 2)[0] +} + // --------------------------------------------------------------------------- // Uses (Pod-specific) // --------------------------------------------------------------------------- @@ -350,14 +429,17 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, if pod == nil { return nil } + return buildUsesFromPodSpec(ctx, pod.Namespace, pod.Spec, ac, omitted) +} +func buildUsesFromPodSpec(ctx context.Context, namespace string, spec corev1.PodSpec, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { cmSet := newRefSet() secretSet := newRefSet() pvcSet := newRefSet() - scanVolumes(pod.Spec.Volumes, pod.Namespace, cmSet, secretSet, pvcSet) - scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) - scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) + scanVolumes(spec.Volumes, namespace, cmSet, secretSet, pvcSet) + scanContainers(spec.InitContainers, namespace, cmSet, secretSet) + scanContainers(spec.Containers, namespace, cmSet, secretSet) uses := &UsesBlock{ ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", ""), "uses.configMaps", omitted), @@ -365,10 
+447,10 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", ""), "uses.pvcs", omitted), } - if sa := pod.Spec.ServiceAccountName; sa != "" { + if sa := spec.ServiceAccountName; sa != "" { candidate := &ContextRef{ Kind: "ServiceAccount", - Namespace: pod.Namespace, + Namespace: namespace, Name: sa, } if checkRef(ctx, ac, candidate) { @@ -384,6 +466,25 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, return uses } +func buildUsesFromWorkload(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + switch v := obj.(type) { + case *appsv1.Deployment: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.StatefulSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.DaemonSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.ReplicaSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *batchv1.Job: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *batchv1.CronJob: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.JobTemplate.Spec.Template.Spec, ac, omitted) + default: + return nil + } +} + func scanVolumes(vols []corev1.Volume, ns string, cm, secret, pvc *refSet) { for _, v := range vols { if v.ConfigMap != nil { @@ -432,6 +533,480 @@ func scanContainers(containers []corev1.Container, ns string, cm, secret *refSet } } +const maxServicePodRefs = 10 + +func buildServiceSummary(ctx context.Context, svc *corev1.Service, lookup ServiceBackendLookup, ac RefAccessChecker, omitted *omittedTracker) *ServiceSummary { + if svc == nil { + return nil + } + out := &ServiceSummary{} + + if len(svc.Spec.Selector) == 0 { + if svc.Spec.Type != corev1.ServiceTypeExternalName { + out.Warnings 
= append(out.Warnings, ServiceWarningNoSelector) + } + return out + } + if lookup == nil { + return nil + } + + selector := labels.SelectorFromSet(labels.Set(svc.Spec.Selector)) + pods, err := lookup.PodsForServiceSelector(svc.Namespace, selector) + if err != nil { + return nil + } + + sel := &PodSelectionSummary{Total: len(pods)} + for _, pod := range pods { + ref := ContextRef{Kind: "Pod", Namespace: pod.Namespace, Name: pod.Name} + if !checkRef(ctx, ac, &ref) { + omitted.add("serviceSummary.selectedPods", OmittedRBACDenied) + continue + } + if isPodReady(pod) { + sel.Ready++ + appendBoundedPodRef(&sel.ReadyPods, ref, sel) + } else { + sel.NotReady++ + appendBoundedPodRef(&sel.NotReadyPods, ref, sel) + } + } + + if sel.Total == 0 { + out.Warnings = append(out.Warnings, ServiceWarningNoSelectedPods) + } else if sel.Ready == 0 { + out.Warnings = append(out.Warnings, ServiceWarningNoReadyPods) + } + out.SelectedPods = sel + return out +} + +func appendBoundedPodRef(dst *[]ContextRef, ref ContextRef, sel *PodSelectionSummary) { + if len(*dst) >= maxServicePodRefs { + sel.Truncated = true + return + } + *dst = append(*dst, ref) +} + +func isPodReady(pod *corev1.Pod) bool { + if pod == nil { + return false + } + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady { + return cond.Status == corev1.ConditionTrue + } + } + return false +} + +const maxSummaryItems = 12 + +func buildPodSummary(pod *corev1.Pod) *PodSummary { + if pod == nil { + return nil + } + out := &PodSummary{ + Phase: string(pod.Status.Phase), + Ready: isPodReady(pod), + } + statuses := make([]corev1.ContainerStatus, 0, len(pod.Status.InitContainerStatuses)+len(pod.Status.ContainerStatuses)) + statuses = append(statuses, pod.Status.InitContainerStatuses...) + statuses = append(statuses, pod.Status.ContainerStatuses...) 
+ if len(statuses) > maxSummaryItems { + statuses = statuses[:maxSummaryItems] + } + for _, st := range statuses { + out.RestartCount += st.RestartCount + out.Containers = append(out.Containers, containerStateSummary(st)) + } + return out +} + +func containerStateSummary(st corev1.ContainerStatus) ContainerStateSummary { + out := ContainerStateSummary{ + Name: st.Name, + Ready: st.Ready, + RestartCount: st.RestartCount, + } + switch { + case st.State.Waiting != nil: + out.State = "waiting" + out.Reason = st.State.Waiting.Reason + case st.State.Running != nil: + out.State = "running" + case st.State.Terminated != nil: + out.State = "terminated" + out.Reason = st.State.Terminated.Reason + } + if st.LastTerminationState.Terminated != nil { + out.LastTerminationReason = st.LastTerminationState.Terminated.Reason + } + return out +} + +func buildWorkloadSummary(obj runtime.Object) *WorkloadSummary { + switch v := obj.(type) { + case *appsv1.Deployment: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Updated: v.Status.UpdatedReplicas, + Unavailable: v.Status.UnavailableReplicas, + }} + case *appsv1.StatefulSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Updated: v.Status.UpdatedReplicas, + }} + case *appsv1.DaemonSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: v.Status.DesiredNumberScheduled, + Ready: v.Status.NumberReady, + Available: v.Status.NumberAvailable, + Updated: v.Status.UpdatedNumberScheduled, + Unavailable: v.Status.NumberUnavailable, + }} + case *appsv1.ReplicaSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Unavailable: maxInt32(0, 
v.Status.Replicas-v.Status.AvailableReplicas), + }} + default: + return nil + } +} + +func buildIngressSummary(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *IngressSummary { + ing, ok := obj.(*networkingv1.Ingress) + if !ok || ing == nil { + return nil + } + out := &IngressSummary{} + if ing.Spec.IngressClassName != nil { + out.Class = *ing.Spec.IngressClassName + } else if ing.Annotations != nil { + out.Class = ing.Annotations["kubernetes.io/ingress.class"] + } + for _, addr := range ing.Status.LoadBalancer.Ingress { + if addr.IP != "" { + out.Addresses = append(out.Addresses, addr.IP) + } else if addr.Hostname != "" { + out.Addresses = append(out.Addresses, addr.Hostname) + } + } + if len(out.Addresses) == 0 { + out.Warnings = append(out.Warnings, IngressWarningNoAddress) + } + if out.Class == "" { + out.Warnings = append(out.Warnings, IngressWarningNoClass) + } + if len(ing.Spec.Rules) == 0 { + out.Warnings = append(out.Warnings, IngressWarningNoRules) + } + + svcSet := newRefSet() + addIngressBackendService(svcSet, ing.Namespace, ing.Spec.DefaultBackend) + for _, rule := range ing.Spec.Rules { + if rule.HTTP == nil { + continue + } + for _, path := range rule.HTTP.Paths { + addIngressBackendService(svcSet, ing.Namespace, &path.Backend) + } + } + out.BackendServices = filterRefs(ctx, ac, svcSet.refs("Service", ""), "ingressSummary.backendServices", omitted) + + secretSet := newRefSet() + for _, tls := range ing.Spec.TLS { + secretSet.add(tls.SecretName, ing.Namespace) + } + out.TLSSecrets = filterRefs(ctx, ac, secretSet.refs("Secret", ""), "ingressSummary.tlsSecrets", omitted) + + if out.Class == "" && len(out.Addresses) == 0 && len(out.BackendServices) == 0 && len(out.TLSSecrets) == 0 && len(out.Warnings) == 0 { + return nil + } + return out +} + +func addIngressBackendService(dst *refSet, namespace string, backend *networkingv1.IngressBackend) { + if backend == nil || backend.Service == nil { + return + } + 
dst.add(backend.Service.Name, namespace) +} + +func buildNodeSummary(obj runtime.Object) *NodeSummary { + node, ok := obj.(*corev1.Node) + if !ok || node == nil { + return nil + } + out := &NodeSummary{ + Unschedulable: node.Spec.Unschedulable, + Capacity: compactResourceList(node.Status.Capacity), + Allocatable: compactResourceList(node.Status.Allocatable), + } + if out.Unschedulable { + out.Warnings = append(out.Warnings, NodeWarningUnschedulable) + } + for _, cond := range node.Status.Conditions { + switch cond.Type { + case corev1.NodeReady: + out.ReadyStatus = string(cond.Status) + if cond.Status != corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningNotReady) + } + case corev1.NodeDiskPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningDiskPressure) + } + case corev1.NodeMemoryPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningMemoryPressure) + } + case corev1.NodePIDPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningPIDPressure) + } + case corev1.NodeNetworkUnavailable: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningNetworkUnavailable) + } + } + } + for _, taint := range node.Spec.Taints { + out.Taints = append(out.Taints, TaintSummary{ + Key: taint.Key, + Value: taint.Value, + Effect: string(taint.Effect), + }) + } + return out +} + +func compactResourceList(resources corev1.ResourceList) map[string]string { + if len(resources) == 0 { + return nil + } + out := make(map[string]string, 4) + for _, name := range []corev1.ResourceName{ + corev1.ResourceCPU, + corev1.ResourceMemory, + corev1.ResourcePods, + corev1.ResourceEphemeralStorage, + } { + if qty, ok := resources[name]; ok { + out[string(name)] = qty.String() + } + } + if len(out) == 0 { + return nil + } + return out +} + +func buildPVCSummary(obj runtime.Object) *PVCSummary { + pvc, ok := 
obj.(*corev1.PersistentVolumeClaim) + if !ok || pvc == nil { + return nil + } + out := &PVCSummary{ + Phase: string(pvc.Status.Phase), + StorageClassName: valueOrEmpty(pvc.Spec.StorageClassName), + VolumeName: pvc.Spec.VolumeName, + VolumeMode: string(valueOrZero(pvc.Spec.VolumeMode)), + } + if req, ok := pvc.Spec.Resources.Requests[corev1.ResourceStorage]; ok { + out.RequestedStorage = req.String() + } + if cap, ok := pvc.Status.Capacity[corev1.ResourceStorage]; ok { + out.CapacityStorage = cap.String() + } + for _, mode := range pvc.Spec.AccessModes { + out.AccessModes = append(out.AccessModes, string(mode)) + } + if pvc.Annotations != nil { + out.Provisioner = pvc.Annotations["volume.kubernetes.io/storage-provisioner"] + out.SelectedNode = pvc.Annotations["volume.kubernetes.io/selected-node"] + out.BindCompleted = pvc.Annotations["pv.kubernetes.io/bind-completed"] + } + switch pvc.Status.Phase { + case corev1.ClaimPending: + out.Warnings = append(out.Warnings, PVCWarningPending) + case corev1.ClaimLost: + out.Warnings = append(out.Warnings, PVCWarningLost) + } + return out +} + +func buildJobSummary(obj runtime.Object) *JobSummary { + job, ok := obj.(*batchv1.Job) + if !ok || job == nil { + return nil + } + out := &JobSummary{ + Active: job.Status.Active, + Succeeded: job.Status.Succeeded, + Failed: job.Status.Failed, + Completions: int32OrDefault(job.Spec.Completions, 1), + Parallelism: int32OrDefault(job.Spec.Parallelism, 1), + BackoffLimit: int32OrDefault(job.Spec.BackoffLimit, 6), + Suspended: boolOrFalse(job.Spec.Suspend), + } + return out +} + +func buildCronJobSummary(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *CronJobSummary { + cj, ok := obj.(*batchv1.CronJob) + if !ok || cj == nil { + return nil + } + out := &CronJobSummary{ + Schedule: cj.Spec.Schedule, + Suspended: boolOrFalse(cj.Spec.Suspend), + } + if cj.Status.LastScheduleTime != nil { + out.LastScheduleTime = 
cj.Status.LastScheduleTime.Format("2006-01-02T15:04:05Z07:00") + } + if cj.Status.LastSuccessfulTime != nil { + out.LastSuccessfulTime = cj.Status.LastSuccessfulTime.Format("2006-01-02T15:04:05Z07:00") + } + active := make([]ContextRef, 0, len(cj.Status.Active)) + for _, ref := range cj.Status.Active { + if ref.Kind == "" || ref.Name == "" { + continue + } + active = append(active, ContextRef{ + Kind: ref.Kind, + Group: groupFromAPIVersion(ref.APIVersion), + Namespace: cj.Namespace, + Name: ref.Name, + }) + } + out.ActiveJobs = filterRefs(ctx, ac, active, "cronJobSummary.activeJobs", omitted) + return out +} + +func replicasOrZero(p *int32) int32 { + if p == nil { + return 0 + } + return *p +} + +func int32OrDefault(p *int32, fallback int32) int32 { + if p == nil { + return fallback + } + return *p +} + +func boolOrFalse(p *bool) bool { + return p != nil && *p +} + +func valueOrEmpty(p *string) string { + if p == nil { + return "" + } + return *p +} + +func valueOrZero[T ~string](p *T) T { + var zero T + if p == nil { + return zero + } + return *p +} + +func maxInt32(a, b int32) int32 { + if a > b { + return a + } + return b +} + +func buildStatusSummary(obj runtime.Object) *StatusSummary { + if obj == nil { + return nil + } + u, ok := objectToMap(obj) + if !ok { + return nil + } + status, ok, _ := unstructured.NestedMap(u, "status") + if !ok { + return nil + } + out := &StatusSummary{} + if phase, ok, _ := unstructured.NestedString(status, "phase"); ok { + out.Phase = phase + } + if conditions, ok, _ := unstructured.NestedSlice(status, "conditions"); ok { + if len(conditions) > maxSummaryItems { + conditions = conditions[:maxSummaryItems] + } + for _, item := range conditions { + cond, ok := item.(map[string]interface{}) + if !ok { + continue + } + summary := ConditionSummary{ + Type: stringField(cond, "type"), + Status: stringField(cond, "status"), + Reason: stringField(cond, "reason"), + Message: truncateRunes(stringField(cond, "message"), 300), + 
LastTransitionTime: stringField(cond, "lastTransitionTime"), + } + if summary.Type == "" && summary.Status == "" { + continue + } + out.Conditions = append(out.Conditions, summary) + } + } + if out.Phase == "" && len(out.Conditions) == 0 { + return nil + } + return out +} + +func objectToMap(obj runtime.Object) (map[string]interface{}, bool) { + if u, ok := obj.(*unstructured.Unstructured); ok { + return u.Object, true + } + out, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, false + } + return out, true +} + +func stringField(m map[string]interface{}, key string) string { + if v, ok := m[key].(string); ok { + return v + } + return "" +} + +func truncateRunes(s string, limit int) string { + if limit <= 0 || len(s) == 0 { + return "" + } + runes := []rune(s) + if len(runes) <= limit { + return s + } + return string(runes[:limit]) +} + // refSet collects (name, namespace) pairs with insertion-order preservation // for deterministic output. Names with empty namespaces are tolerated (the // PVC ClaimName can be cluster-scoped only in odd configurations, but we diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index c1fe2a5c0..0729bce78 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -7,10 +7,14 @@ import ( appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" policyv1 "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" "github.com/skyhook-io/radar/pkg/topology" ) @@ -45,6 +49,18 @@ func (m mockPolicyReports) FindingsFor(group, kind, namespace, name string) []Ky return m[kind+"/"+namespace+"/"+name] } +type mockServiceBackends []*corev1.Pod + +func (m mockServiceBackends) 
PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) { + out := make([]*corev1.Pod, 0, len(m)) + for _, pod := range m { + if pod.Namespace == namespace && selector.Matches(labels.Set(pod.Labels)) { + out = append(out, pod) + } + } + return out, nil +} + // --------------------------------------------------------------------------- // Golden-file tests // --------------------------------------------------------------------------- @@ -180,6 +196,9 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { if rc.RunsOn == nil || rc.RunsOn.Name != "node-1" { t.Errorf("RunsOn: got %+v want Node/node-1", rc.RunsOn) } + if rc.Owner == nil || rc.Owner.Kind != "ReplicaSet" || rc.Owner.Name != "web-7d" || rc.Owner.Group != "apps" { + t.Errorf("Owner: got %+v want apps/ReplicaSet prod/web-7d", rc.Owner) + } // Uses: 2 ConfigMaps (web-config + shared-env), 2 Secrets (web-creds + api-key-secret), 1 PVC, ServiceAccount. if rc.Uses == nil { @@ -240,9 +259,112 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { } } +func TestBuild_Deployment_WorkloadSummaryAndTemplateUses(t *testing.T) { + replicas := int32(3) + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + ServiceAccountName: "api-sa", + Volumes: []corev1.Volume{{ + Name: "settings", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api-settings"}, + }, + }, + }}, + Containers: []corev1.Container{{ + Name: "api", + Env: []corev1.EnvVar{{ + Name: "TOKEN", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api-token"}, + Key: "token", + }, + }, + }}, + }}, + }, + }, + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 2, + AvailableReplicas: 2, + 
UpdatedReplicas: 3, + UnavailableReplicas: 1, + }, + } + + rc := Build(context.Background(), dep, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.WorkloadSummary == nil || rc.WorkloadSummary.Replicas == nil { + t.Fatalf("WorkloadSummary.Replicas: got nil; rc=%+v", rc) + } + rep := rc.WorkloadSummary.Replicas + if rep.Desired != 3 || rep.Ready != 2 || rep.Available != 2 || rep.Updated != 3 || rep.Unavailable != 1 { + t.Errorf("Replicas: got %+v", rep) + } + if rc.Uses == nil { + t.Fatal("Uses: got nil") + } + if got, want := len(rc.Uses.ConfigMaps), 1; got != want { + t.Errorf("Uses.ConfigMaps len: got %d want %d (%+v)", got, want, rc.Uses.ConfigMaps) + } + if got, want := len(rc.Uses.Secrets), 1; got != want { + t.Errorf("Uses.Secrets len: got %d want %d (%+v)", got, want, rc.Uses.Secrets) + } + if rc.Uses.ServiceAccount == nil || rc.Uses.ServiceAccount.Name != "api-sa" { + t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount) + } +} + +func TestBuild_Pod_PodSummary(t *testing.T) { + pod := readyPod("api-1", "prod", map[string]string{"app": "api"}, true) + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: "api", + Ready: false, + RestartCount: 2, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}, + }, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "Error"}, + }, + }, + { + Name: "sidecar", + Ready: true, + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }, + } + + rc := Build(context.Background(), pod, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.PodSummary == nil { + t.Fatal("PodSummary: got nil") + } + if rc.PodSummary.Phase != "Running" || !rc.PodSummary.Ready || rc.PodSummary.RestartCount != 2 { + t.Errorf("PodSummary: got %+v", rc.PodSummary) + } + if got, want := len(rc.PodSummary.Containers), 2; got != want { + 
t.Fatalf("Containers len: got %d want %d", got, want) + } + c := rc.PodSummary.Containers[0] + if c.State != "waiting" || c.Reason != "CrashLoopBackOff" || c.LastTerminationReason != "Error" { + t.Errorf("Container[0]: got %+v", c) + } +} + func TestBuild_Service_ExposedByIngress(t *testing.T) { svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{"app": "api"}, + }, } topo := &topology.Topology{ Nodes: []topology.Node{ @@ -257,6 +379,11 @@ func TestBuild_Service_ExposedByIngress(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, + ServiceBackends: mockServiceBackends{ + readyPod("api-1", "prod", map[string]string{"app": "api"}, true), + readyPod("api-2", "prod", map[string]string{"app": "api"}, false), + readyPod("other", "prod", map[string]string{"app": "other"}, true), + }, }) if got, want := len(rc.Exposes), 1; got != want { @@ -269,6 +396,219 @@ func TestBuild_Service_ExposedByIngress(t *testing.T) { if rc.Uses != nil { t.Errorf("Uses should be nil for Service: got %+v", rc.Uses) } + if rc.ServiceSummary == nil || rc.ServiceSummary.SelectedPods == nil { + t.Fatalf("ServiceSummary.SelectedPods: got nil; rc=%+v", rc) + } + if got, want := rc.ServiceSummary.SelectedPods.Total, 2; got != want { + t.Errorf("SelectedPods.Total: got %d want %d", got, want) + } + if got, want := rc.ServiceSummary.SelectedPods.Ready, 1; got != want { + t.Errorf("SelectedPods.Ready: got %d want %d", got, want) + } + if got, want := rc.ServiceSummary.SelectedPods.NotReady, 1; got != want { + t.Errorf("SelectedPods.NotReady: got %d want %d", got, want) + } + if len(rc.ServiceSummary.Warnings) != 0 { + t.Errorf("ServiceSummary.Warnings: got %+v want none", rc.ServiceSummary.Warnings) + } +} + +func TestBuild_Service_NoReadyPodsWarning(t *testing.T) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: corev1.ServiceSpec{ 
+ Selector: map[string]string{"app": "api"}, + }, + } + rc := Build(context.Background(), svc, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + ServiceBackends: mockServiceBackends{ + readyPod("api-1", "prod", map[string]string{"app": "api"}, false), + }, + }) + if rc.ServiceSummary == nil { + t.Fatal("ServiceSummary: got nil") + } + if got := rc.ServiceSummary.Warnings; len(got) != 1 || got[0] != ServiceWarningNoReadyPods { + t.Fatalf("Warnings: got %+v want [%s]", got, ServiceWarningNoReadyPods) + } +} + +func TestBuild_IngressSummary_BackendsTLSAndWarnings(t *testing.T) { + className := "nginx" + ing := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod"}, + Spec: networkingv1.IngressSpec{ + IngressClassName: &className, + DefaultBackend: &networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{Name: "fallback"}, + }, + Rules: []networkingv1.IngressRule{{ + Host: "example.com", + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{Paths: []networkingv1.HTTPIngressPath{{ + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{Name: "web"}, + }, + }}}, + }, + }}, + TLS: []networkingv1.IngressTLS{{SecretName: "web-tls"}}, + }, + Status: networkingv1.IngressStatus{LoadBalancer: networkingv1.IngressLoadBalancerStatus{ + Ingress: []networkingv1.IngressLoadBalancerIngress{{Hostname: "lb.example.com"}}, + }}, + } + + rc := Build(context.Background(), ing, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.IngressSummary == nil { + t.Fatal("IngressSummary: got nil") + } + if rc.IngressSummary.Class != "nginx" { + t.Errorf("Class: got %q", rc.IngressSummary.Class) + } + if got, want := len(rc.IngressSummary.BackendServices), 2; got != want { + t.Fatalf("BackendServices len: got %d want %d (%+v)", got, want, rc.IngressSummary.BackendServices) + } + if got, want := len(rc.IngressSummary.TLSSecrets), 1; got != want { + 
t.Fatalf("TLSSecrets len: got %d want %d", got, want) + } + if len(rc.IngressSummary.Warnings) != 0 { + t.Errorf("Warnings: got %+v want none", rc.IngressSummary.Warnings) + } +} + +func TestBuild_NodeSummary(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Spec: corev1.NodeSpec{ + Unschedulable: true, + Taints: []corev1.Taint{{ + Key: "dedicated", + Value: "batch", + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + corev1.ResourcePods: resource.MustParse("110"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("3900m"), + corev1.ResourceMemory: resource.MustParse("14Gi"), + }, + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionFalse}, + {Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue}, + }, + }, + } + + rc := Build(context.Background(), node, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.NodeSummary == nil { + t.Fatal("NodeSummary: got nil") + } + if rc.NodeSummary.ReadyStatus != "False" || !rc.NodeSummary.Unschedulable { + t.Errorf("NodeSummary: got %+v", rc.NodeSummary) + } + if rc.NodeSummary.Capacity["cpu"] != "4" || rc.NodeSummary.Allocatable["memory"] != "14Gi" { + t.Errorf("Capacity/Allocatable: got %+v / %+v", rc.NodeSummary.Capacity, rc.NodeSummary.Allocatable) + } + if got, want := len(rc.NodeSummary.Taints), 1; got != want { + t.Fatalf("Taints len: got %d want %d", got, want) + } + if got := rc.NodeSummary.Warnings; len(got) != 3 { + t.Errorf("Warnings: got %+v want unschedulable/not_ready/memory_pressure", got) + } +} + +func TestBuild_PVCSummary(t *testing.T) { + storageClass := "standard" + volumeMode := corev1.PersistentVolumeFilesystem + pvc := &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "data", + Namespace: 
"prod", + Annotations: map[string]string{ + "volume.kubernetes.io/storage-provisioner": "pd.csi.storage.gke.io", + "volume.kubernetes.io/selected-node": "node-1", + "pv.kubernetes.io/bind-completed": "yes", + }, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + StorageClassName: &storageClass, + VolumeName: "pv-data", + VolumeMode: &volumeMode, + AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("10Gi")}, + }, + }, + Status: corev1.PersistentVolumeClaimStatus{ + Phase: corev1.ClaimPending, + Capacity: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("8Gi")}, + }, + } + + rc := Build(context.Background(), pvc, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.PVCSummary == nil { + t.Fatal("PVCSummary: got nil") + } + if rc.PVCSummary.Phase != "Pending" || rc.PVCSummary.RequestedStorage != "10Gi" || rc.PVCSummary.CapacityStorage != "8Gi" { + t.Errorf("PVCSummary: got %+v", rc.PVCSummary) + } + if got := rc.PVCSummary.Warnings; len(got) != 1 || got[0] != PVCWarningPending { + t.Errorf("Warnings: got %+v", got) + } +} + +func TestBuild_JobAndCronJobSummary(t *testing.T) { + completions := int32(5) + parallelism := int32(2) + backoff := int32(3) + suspend := true + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod"}, + Spec: batchv1.JobSpec{ + Completions: &completions, + Parallelism: &parallelism, + BackoffLimit: &backoff, + Suspend: &suspend, + }, + Status: batchv1.JobStatus{Active: 1, Succeeded: 2, Failed: 1}, + } + rc := Build(context.Background(), job, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.JobSummary == nil { + t.Fatal("JobSummary: got nil") + } + if rc.JobSummary.Completions != 5 || rc.JobSummary.Parallelism != 2 || rc.JobSummary.BackoffLimit != 3 || !rc.JobSummary.Suspended { + t.Errorf("JobSummary: got %+v", rc.JobSummary) + } 
+ + cj := &batchv1.CronJob{ + ObjectMeta: metav1.ObjectMeta{Name: "nightly", Namespace: "prod"}, + Spec: batchv1.CronJobSpec{Schedule: "0 0 * * *", Suspend: &suspend}, + Status: batchv1.CronJobStatus{ + Active: []corev1.ObjectReference{{ + APIVersion: "batch/v1", + Kind: "Job", + Name: "nightly-1", + }}, + }, + } + rc = Build(context.Background(), cj, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.CronJobSummary == nil { + t.Fatal("CronJobSummary: got nil") + } + if rc.CronJobSummary.Schedule != "0 0 * * *" || !rc.CronJobSummary.Suspended { + t.Errorf("CronJobSummary: got %+v", rc.CronJobSummary) + } + if got, want := len(rc.CronJobSummary.ActiveJobs), 1; got != want { + t.Fatalf("ActiveJobs len: got %d want %d", got, want) + } } func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { @@ -495,12 +835,70 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { } } +func TestBuild_Unstructured_StatusSummary(t *testing.T) { + obj := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "example.com/v1", + "kind": "Widget", + "metadata": map[string]interface{}{ + "name": "w1", + "namespace": "prod", + }, + "status": map[string]interface{}{ + "phase": "Reconciling", + "conditions": []interface{}{ + map[string]interface{}{ + "type": "Ready", + "status": "False", + "reason": "DependencyMissing", + "message": "waiting for dependency", + "lastTransitionTime": "2026-05-21T10:00:00Z", + }, + }, + }, + }} + + rc := Build(context.Background(), obj, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.StatusSummary == nil { + t.Fatal("StatusSummary: got nil") + } + if rc.StatusSummary.Phase != "Reconciling" { + t.Errorf("Phase: got %q", rc.StatusSummary.Phase) + } + if got, want := len(rc.StatusSummary.Conditions), 1; got != want { + t.Fatalf("Conditions len: got %d want %d", got, want) + } + cond := rc.StatusSummary.Conditions[0] + if cond.Type != "Ready" || cond.Status != "False" || cond.Reason != 
"DependencyMissing" { + t.Errorf("Condition: got %+v", cond) + } +} + // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- func ptrBool(b bool) *bool { return &b } +func readyPod(name, namespace string, podLabels map[string]string, ready bool) *corev1.Pod { + status := corev1.ConditionFalse + if ready { + status = corev1.ConditionTrue + } + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: podLabels, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: status, + }}, + }, + } +} + func contains(s, sub string) bool { return len(s) >= len(sub) && indexOf(s, sub) >= 0 } diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 235ed2749..834c0508d 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -28,17 +28,27 @@ package resourcecontext // emerges that needs deterministic prose, add it as a separate // `explain_resource` tool rather than re-introducing it inline here. 
type ResourceContext struct { - Tier ContextTier `json:"tier"` - ManagedBy []ContextRef `json:"managedBy,omitempty"` - Exposes []ContextRef `json:"exposes,omitempty"` - SelectedBy []ContextRef `json:"selectedBy,omitempty"` - Uses *UsesBlock `json:"uses,omitempty"` - RunsOn *ContextRef `json:"runsOn,omitempty"` - ScaledBy []ContextRef `json:"scaledBy,omitempty"` - IssueSummary *IssueSummary `json:"issueSummary,omitempty"` - AuditSummary *AuditSummary `json:"auditSummary,omitempty"` - PolicySummary *PolicySummary `json:"policySummary,omitempty"` - Omitted []OmittedField `json:"omitted,omitempty"` + Tier ContextTier `json:"tier"` + Owner *ContextRef `json:"owner,omitempty"` + ManagedBy []ContextRef `json:"managedBy,omitempty"` + Exposes []ContextRef `json:"exposes,omitempty"` + SelectedBy []ContextRef `json:"selectedBy,omitempty"` + Uses *UsesBlock `json:"uses,omitempty"` + RunsOn *ContextRef `json:"runsOn,omitempty"` + ScaledBy []ContextRef `json:"scaledBy,omitempty"` + StatusSummary *StatusSummary `json:"statusSummary,omitempty"` + PodSummary *PodSummary `json:"podSummary,omitempty"` + WorkloadSummary *WorkloadSummary `json:"workloadSummary,omitempty"` + ServiceSummary *ServiceSummary `json:"serviceSummary,omitempty"` + IngressSummary *IngressSummary `json:"ingressSummary,omitempty"` + NodeSummary *NodeSummary `json:"nodeSummary,omitempty"` + PVCSummary *PVCSummary `json:"pvcSummary,omitempty"` + JobSummary *JobSummary `json:"jobSummary,omitempty"` + CronJobSummary *CronJobSummary `json:"cronJobSummary,omitempty"` + IssueSummary *IssueSummary `json:"issueSummary,omitempty"` + AuditSummary *AuditSummary `json:"auditSummary,omitempty"` + PolicySummary *PolicySummary `json:"policySummary,omitempty"` + Omitted []OmittedField `json:"omitted,omitempty"` } // ContextTier signals how much enrichment is included. 
"basic" is the @@ -95,6 +105,157 @@ type UsesBlock struct { PVCs []ContextRef `json:"pvcs,omitempty"` } +// StatusSummary is the generic, deterministic status projection used for +// built-ins and CRDs. It intentionally carries raw condition facts rather than +// prose conclusions. +type StatusSummary struct { + Phase string `json:"phase,omitempty"` + Conditions []ConditionSummary `json:"conditions,omitempty"` +} + +type ConditionSummary struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason,omitempty"` + Message string `json:"message,omitempty"` + LastTransitionTime string `json:"lastTransitionTime,omitempty"` +} + +type PodSummary struct { + Phase string `json:"phase,omitempty"` + Ready bool `json:"ready"` + RestartCount int32 `json:"restartCount,omitempty"` + Containers []ContainerStateSummary `json:"containers,omitempty"` +} + +type ContainerStateSummary struct { + Name string `json:"name"` + Ready bool `json:"ready"` + RestartCount int32 `json:"restartCount,omitempty"` + State string `json:"state,omitempty"` + Reason string `json:"reason,omitempty"` + LastTerminationReason string `json:"lastTerminationReason,omitempty"` +} + +type WorkloadSummary struct { + Replicas *ReplicaSummary `json:"replicas,omitempty"` + Conditions []ConditionSummary `json:"conditions,omitempty"` +} + +type ReplicaSummary struct { + Desired int32 `json:"desired,omitempty"` + Ready int32 `json:"ready,omitempty"` + Available int32 `json:"available,omitempty"` + Updated int32 `json:"updated,omitempty"` + Unavailable int32 `json:"unavailable,omitempty"` +} + +// ServiceSummary adds realized backend state for a Service. The raw Service +// spec already contains type/ports/selector; this block focuses on facts that +// require looking at related resources. 
+type ServiceSummary struct { + SelectedPods *PodSelectionSummary `json:"selectedPods,omitempty"` + Warnings []ServiceWarning `json:"warnings,omitempty"` +} + +type PodSelectionSummary struct { + Total int `json:"total"` + Ready int `json:"ready"` + NotReady int `json:"notReady,omitempty"` + ReadyPods []ContextRef `json:"readyPods,omitempty"` + NotReadyPods []ContextRef `json:"notReadyPods,omitempty"` + Truncated bool `json:"truncated,omitempty"` +} + +type ServiceWarning string + +const ( + ServiceWarningNoSelector ServiceWarning = "no_selector" + ServiceWarningNoSelectedPods ServiceWarning = "no_selected_pods" + ServiceWarningNoReadyPods ServiceWarning = "no_ready_pods" +) + +type IngressSummary struct { + Class string `json:"class,omitempty"` + Addresses []string `json:"addresses,omitempty"` + BackendServices []ContextRef `json:"backendServices,omitempty"` + TLSSecrets []ContextRef `json:"tlsSecrets,omitempty"` + Warnings []IngressWarning `json:"warnings,omitempty"` +} + +type IngressWarning string + +const ( + IngressWarningNoAddress IngressWarning = "no_address" + IngressWarningNoClass IngressWarning = "no_class" + IngressWarningNoRules IngressWarning = "no_rules" +) + +type NodeSummary struct { + ReadyStatus string `json:"readyStatus,omitempty"` + Unschedulable bool `json:"unschedulable,omitempty"` + Capacity map[string]string `json:"capacity,omitempty"` + Allocatable map[string]string `json:"allocatable,omitempty"` + Taints []TaintSummary `json:"taints,omitempty"` + Warnings []NodeWarning `json:"warnings,omitempty"` +} + +type TaintSummary struct { + Key string `json:"key"` + Value string `json:"value,omitempty"` + Effect string `json:"effect"` +} + +type NodeWarning string + +const ( + NodeWarningUnschedulable NodeWarning = "unschedulable" + NodeWarningNotReady NodeWarning = "not_ready" + NodeWarningDiskPressure NodeWarning = "disk_pressure" + NodeWarningMemoryPressure NodeWarning = "memory_pressure" + NodeWarningPIDPressure NodeWarning = "pid_pressure" + 
NodeWarningNetworkUnavailable NodeWarning = "network_unavailable" +) + +type PVCSummary struct { + Phase string `json:"phase,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + VolumeName string `json:"volumeName,omitempty"` + RequestedStorage string `json:"requestedStorage,omitempty"` + CapacityStorage string `json:"capacityStorage,omitempty"` + AccessModes []string `json:"accessModes,omitempty"` + VolumeMode string `json:"volumeMode,omitempty"` + Provisioner string `json:"provisioner,omitempty"` + SelectedNode string `json:"selectedNode,omitempty"` + BindCompleted string `json:"bindCompleted,omitempty"` + Warnings []PVCWarning `json:"warnings,omitempty"` +} + +type PVCWarning string + +const ( + PVCWarningPending PVCWarning = "pending" + PVCWarningLost PVCWarning = "lost" +) + +type JobSummary struct { + Active int32 `json:"active,omitempty"` + Succeeded int32 `json:"succeeded,omitempty"` + Failed int32 `json:"failed,omitempty"` + Completions int32 `json:"completions,omitempty"` + Parallelism int32 `json:"parallelism,omitempty"` + BackoffLimit int32 `json:"backoffLimit,omitempty"` + Suspended bool `json:"suspended,omitempty"` +} + +type CronJobSummary struct { + Schedule string `json:"schedule,omitempty"` + Suspended bool `json:"suspended,omitempty"` + ActiveJobs []ContextRef `json:"activeJobs,omitempty"` + LastScheduleTime string `json:"lastScheduleTime,omitempty"` + LastSuccessfulTime string `json:"lastSuccessfulTime,omitempty"` +} + // IssueSummary is a rollup of internal issue-engine findings scoped to // the subject resource. Pre-computed by callers and passed into the // generator — this package does not import internal/issues.