From e76e4fe94e4d84167e4bb3b4d8de8c7b05dd9447 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 03:30:15 +0300 Subject: [PATCH 01/17] feat(resourcecontext): add Build generator + wire /api/ai/resources GET (T6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the basic-tier resourceContext builder and wires it into the /api/ai/resources/{kind}/{ns}/{name} GET handler. The package layer is pure-Go and depends only on pkg/topology — callers in internal/* pre- compute IssueSummary and AuditSummary so this package doesn't reach into internal/issues or internal/audit. What ships: - pkg/resourcecontext/build.go: Build(ctx, obj, opts) walks topology relationships + pod spec to populate ManagedBy (Argo tracking-id / Flux labels / owner ref), Exposes, SelectedBy (PDB + NetworkPolicy split), Uses (ConfigMaps/Secrets/PVCs/SA), RunsOn, ScaledBy, and PolicySummary. Every ContextRef passes through opts.AccessChecker; denials are dropped and recorded in Omitted with rbac_denied reason. - pkg/resourcecontext/hints.go: SynthesizeHints renders 5–8 short, deterministic prose lines for AI consumers. EmitHints=false on UI callers leaves Hints empty. - internal/server/rc_rbac.go: requestScopedChecker wraps Server.canRead with a per-request (verb,group,kind,namespace) memoization layer. Collapses the ~30 ref-checks per response into ~5 SAR calls without changing the underlying PermissionCache TTL. - internal/server/ai_handlers.go: handleAIGetResource now returns { resource, resourceContext } by default. ?context=none keeps the pre-T6 bare shape. IssueSummary uses issues.ComposeWithStats scoped to the resource; AuditSummary uses pkg/audit.IndexByResource. handleAIListResources is unchanged — T8/T9 territory. Golden tests cover Pod (full enrichment + RBAC denial + JSON shape), Deployment (Flux HelmRelease label precedence), Service (Ingress Exposes), NetworkPolicy (outgoing EdgeProtects intentionally not surfaced), ConfigMap (owner-chain only), PolicyReports rollup, HPA identity, and the EmitHints=false path. Hints determinism is pinned by a separate golden in hints_test.go. --- internal/server/ai_handlers.go | 294 ++++++++++-- internal/server/rc_rbac.go | 88 ++++ pkg/resourcecontext/build.go | 722 ++++++++++++++++++++++++++++++ pkg/resourcecontext/build_test.go | 590 ++++++++++++++++++++++++ pkg/resourcecontext/hints.go | 232 ++++++++++ pkg/resourcecontext/hints_test.go | 118 +++++ 6 files changed, 2013 insertions(+), 31 deletions(-) create mode 100644 internal/server/rc_rbac.go create mode 100644 pkg/resourcecontext/build.go create mode 100644 pkg/resourcecontext/build_test.go create mode 100644 pkg/resourcecontext/hints.go create mode 100644 pkg/resourcecontext/hints_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ea130bac..4db391ef2 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -1,15 +1,22 @@ package server import ( + "context" "fmt" "net/http" "strings" "github.com/go-chi/chi/v5" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/audit" + "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" ) // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. @@ -110,8 +117,23 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 s.writeJSON(w, results) } -// handleAIGetResource returns a single minified resource for AI consumption. -// GET /api/ai/resources/{kind}/{namespace}/{name}?group=X&verbosity=summary|detail|compact +// handleAIGetResource returns a single minified resource for AI consumption, +// wrapped with a resourceContext enrichment block by default. +// +// GET /api/ai/resources/{kind}/{namespace}/{name} +// +// Query params: +// - group=X API group disambiguator for CRDs. +// - verbosity=... summary | detail | compact (default: detail). +// - context=none Skip resourceContext build, return bare minified resource. +// +// Response shape (default): +// +// { "resource": , "resourceContext": { ...basic tier... } } +// +// Response shape (context=none): +// +// func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return @@ -121,6 +143,7 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") group := r.URL.Query().Get("group") level := parseVerbosity(r, aicontext.LevelDetail) + skipContext := r.URL.Query().Get("context") == "none" // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace if namespace == "_" { @@ -133,41 +156,250 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { return } - // Try typed cache first - obj, err := k8s.FetchResource(cache, kind, namespace, name) - if err == k8s.ErrUnknownKind { - // Fall through to dynamic cache for CRDs - u, dynErr := cache.GetDynamicWithGroup(r.Context(), kind, namespace, name, group) - if dynErr != nil { - if strings.Contains(dynErr.Error(), "unknown resource kind") { - s.writeError(w, http.StatusBadRequest, dynErr.Error()) - return - } - if strings.Contains(dynErr.Error(), "not found") { - s.writeError(w, http.StatusNotFound, dynErr.Error()) - return - } - s.writeError(w, http.StatusInternalServerError, dynErr.Error()) - return - } - s.writeJSON(w, aicontext.MinifyUnstructured(u, level)) - return - } + obj, isUnstructured, err := s.fetchAIResource(r.Context(), cache, kind, namespace, name, group) if err != nil { - if strings.HasPrefix(err.Error(), "forbidden:") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) - return - } - s.writeError(w, http.StatusNotFound, err.Error()) + s.writeAIFetchError(w, kind, err) return } - k8s.SetTypeMeta(obj) - result, err := aicontext.Minify(obj, level) + if !isUnstructured { + k8s.SetTypeMeta(obj) + } + + minified, err := minifyForAI(obj, isUnstructured, level) if err != nil { s.writeError(w, http.StatusInternalServerError, err.Error()) return } - s.writeJSON(w, result) + if skipContext { + s.writeJSON(w, minified) + return + } + + rc := s.buildAIResourceContext(r, obj, kind, namespace, name) + s.writeJSON(w, map[string]any{ + "resource": minified, + "resourceContext": rc, + }) +} + +// fetchAIResource resolves the resource from the typed cache or dynamic cache. +// The bool reports whether the returned object is an unstructured (CRD) value. +func (s *Server) fetchAIResource(ctx context.Context, cache *k8s.ResourceCache, kind, namespace, name, group string) (runtime.Object, bool, error) { + obj, err := k8s.FetchResource(cache, kind, namespace, name) + if err == nil { + return obj, false, nil + } + if err != k8s.ErrUnknownKind { + return nil, false, err + } + u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if dynErr != nil { + return nil, false, dynErr + } + return u, true, nil +} + +// writeAIFetchError maps fetch errors to HTTP status codes. Mirrors the +// previous inline behavior so consumers don't see a status-code drift. +func (s *Server) writeAIFetchError(w http.ResponseWriter, kind string, err error) { + msg := err.Error() + switch { + case strings.HasPrefix(msg, "forbidden:"): + s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) + case strings.Contains(msg, "unknown resource kind"): + s.writeError(w, http.StatusBadRequest, msg) + case strings.Contains(msg, "not found"): + s.writeError(w, http.StatusNotFound, msg) + default: + s.writeError(w, http.StatusNotFound, msg) + } +} + +// minifyForAI dispatches to the right Minify variant based on whether the +// resource is unstructured (CRD) or typed. +func minifyForAI(obj runtime.Object, isUnstructured bool, level aicontext.VerbosityLevel) (any, error) { + if isUnstructured { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, fmt.Errorf("internal: object marked unstructured but is %T", obj) + } + return aicontext.MinifyUnstructured(u, level), nil + } + return aicontext.Minify(obj, level) +} + +// buildAIResourceContext assembles the Options struct and calls Build. +// Returns the populated context — never nil unless obj is nil. +func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kind, namespace, name string) *resourcecontext.ResourceContext { + if obj == nil { + return nil + } + cache := k8s.GetResourceCache() + + issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) + auditSum := computeAuditSummaryForResource(cache, namespace, name) + + opts := resourcecontext.Options{ + Tier: resourcecontext.TierBasic, + AccessChecker: s.newRequestScopedChecker(r), + EmitHints: true, + IssueSummary: issueSum, + AuditSummary: auditSum, + } + + if topo, prov, dyn, ok := s.topologyForContext(namespace); ok { + opts.Topology = topo + opts.Provider = prov + opts.DynamicProv = dyn + } + + return resourcecontext.Build(r.Context(), obj, opts) +} + +// topologyForContext builds (or fetches the memoized) topology scoped to the +// resource's namespace. Cluster-scoped resources get an all-namespaces build. +// Returns ok=false when the cache isn't ready yet. +func (s *Server) topologyForContext(namespace string) (*topology.Topology, topology.ResourceProvider, topology.DynamicProvider, bool) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, nil, nil, false + } + opts := topology.DefaultBuildOptions() + if namespace != "" { + opts.Namespaces = []string{namespace} + } + opts.IncludeReplicaSets = true + opts.ForRelationshipCache = true + + provider := k8s.NewTopologyResourceProvider(cache) + dyn := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + topo, err := s.topoMemo.Get(opts, func() (*topology.Topology, error) { + return topology.NewBuilder(provider).WithDynamic(dyn).Build(opts) + }) + if err != nil || topo == nil { + return nil, nil, nil, false + } + return topo, provider, dyn, true +} + +// computeIssueSummaryForResource rolls up per-resource issue-composer rows +// (problem + condition + optional audit) into an IssueSummary. +// +// The composer is the canonical "what's wrong with this resource" surface — +// it merges problem detection (Deployment/DS/etc.), pod-level conditions, +// and generic CRD condition fallback. Filtering to a single (kind, name) +// is done client-side; the composer's native namespace filter restricts the +// scan to the resource's namespace so we don't walk the whole cluster. +// +// Returns nil when no issues match — Build then omits the IssueSummary field. +func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { + if cache == nil { + return nil + } + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + filters := issues.Filters{ + Kinds: []string{kind}, + Limit: issues.MaxLimit, + } + if namespace != "" { + filters.Namespaces = []string{namespace} + } + rows, _ := issues.ComposeWithStats(provider, filters) + + var count int + var topReason string + var topSeverity issues.Severity + bySource := make(map[string]int) + for _, row := range rows { + if row.Name != name { + continue + } + if namespace != "" && row.Namespace != namespace { + continue + } + count++ + bySource[string(row.Source)]++ + if topSeverity == "" || composeSeverityRank(row.Severity) > composeSeverityRank(topSeverity) { + topSeverity = row.Severity + topReason = row.Reason + } + } + if count == 0 { + return nil + } + return &resourcecontext.IssueSummary{ + Count: count, + HighestSeverity: string(topSeverity), + TopReason: topReason, + BySource: bySource, + } +} + +// composeSeverityRank orders issues.Severity for highest-wins rollup. +func composeSeverityRank(s issues.Severity) int { + switch s { + case issues.SeverityCritical: + return 2 + case issues.SeverityWarning: + return 1 + } + return 0 +} + +// computeAuditSummaryForResource looks up audit findings for the subject +// resource. Uses pkg/audit.IndexByResource so the lookup is keyed on the +// canonical (Kind/ns/name) tuple — handles plural→singular normalization +// via the Finding.Kind values written by the check runner. +func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil { + return nil + } + results := audit.RunFromCache(cache, []string{namespace}, nil) + if results == nil || len(results.Findings) == 0 { + return nil + } + idx := bpaudit.IndexByResource(results.Findings) + var match []bpaudit.Finding + for key, fs := range idx { + parts := strings.SplitN(key, "/", 3) + if len(parts) != 3 { + continue + } + if parts[1] == namespace && parts[2] == name { + match = append(match, fs...) + } + } + if len(match) == 0 { + return nil + } + + var topSeverity, topFinding string + for _, f := range match { + if topSeverity == "" || auditSeverityRank(f.Severity) > auditSeverityRank(topSeverity) { + topSeverity = f.Severity + topFinding = f.CheckID + } + } + return &resourcecontext.AuditSummary{ + Count: len(match), + HighestSeverity: topSeverity, + TopFinding: topFinding, + } +} + +// auditSeverityRank orders audit finding severities ("danger" > "warning"). +func auditSeverityRank(s string) int { + switch s { + case bpaudit.SeverityDanger: + return 2 + case bpaudit.SeverityWarning: + return 1 + } + return 0 } diff --git a/internal/server/rc_rbac.go b/internal/server/rc_rbac.go new file mode 100644 index 000000000..13c808055 --- /dev/null +++ b/internal/server/rc_rbac.go @@ -0,0 +1,88 @@ +package server + +import ( + "context" + "net/http" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// requestScopedChecker adapts Server.canRead into resourcecontext.RefAccessChecker +// with a request-local memoization layer keyed on (verb, group, kind, namespace). +// +// A single resourceContext build emits ~30 candidate refs but only ~5 distinct +// (group, kind, namespace) tuples — most workloads point at ConfigMaps and +// Secrets in their own namespace, plus a ServiceAccount and a Node. Caching +// here collapses the SAR fan-out before reaching s.canRead's per-user cache. +// +// The map is intentionally request-scoped (not server-scoped): server-scoped +// caching is already in pkg/auth.PermissionCache (2-min TTL) and reused via +// s.canRead. The per-request layer exists only to deduplicate the burst this +// builder generates within a single response. +type requestScopedChecker struct { + s *Server + req *http.Request + cache map[string]bool +} + +// newRequestScopedChecker returns a checker scoped to a single HTTP request. +// Not safe for concurrent use across requests; each handler invocation MUST +// construct its own checker. +func (s *Server) newRequestScopedChecker(r *http.Request) *requestScopedChecker { + return &requestScopedChecker{ + s: s, + req: r, + cache: make(map[string]bool, 8), + } +} + +// CanRead implements resourcecontext.RefAccessChecker. +// +// Authorization rules: +// - Namespaced kinds: SAR on (verb=get, group, resource, namespace). +// - Cluster-scoped kinds (namespace == ""): SAR on (verb=get, group, resource, ""). +// - Unknown kinds (not in discovery, not in static catalogue) pass through — +// mirrors the rest of the codebase's unknown-kind passthrough semantics. +// This is safe because Build only emits refs whose kinds are known to the +// topology builder (which itself uses discovery); a kind unknown here is a +// temporary discovery-cold state, not a permission bypass vector. +func (c *requestScopedChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + key := "get|" + group + "|" + kind + "|" + namespace + if v, ok := c.cache[key]; ok { + return v + } + + resource := lookupResourceName(kind, group) + if resource == "" { + // Unknown kind — passthrough. See doc comment for rationale. + c.cache[key] = true + return true + } + + allowed := c.s.canRead(c.req, group, resource, namespace, "get") + c.cache[key] = allowed + return allowed +} + +// Compile-time assertion that requestScopedChecker satisfies the contract. +var _ resourcecontext.RefAccessChecker = (*requestScopedChecker)(nil) + +// lookupResourceName resolves a (kind, group) pair to the canonical plural +// resource name used by SubjectAccessReview. Tries the static cluster-only +// catalogue (covers Nodes / ClusterRoles / etc.), then discovery for everything +// else including CRDs. Returns "" when neither path knows the kind. +func lookupResourceName(kind, group string) string { + if kind == "" { + return "" + } + if g, r, ok := k8s.ClusterOnlyKindGVR(kind); ok && (group == "" || group == g) { + return r + } + if disc := k8s.GetResourceDiscovery(); disc != nil { + if ar, ok := disc.GetResourceWithGroup(kind, group); ok { + return ar.Name + } + } + return "" +} diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go new file mode 100644 index 000000000..c3d7374d4 --- /dev/null +++ b/pkg/resourcecontext/build.go @@ -0,0 +1,722 @@ +package resourcecontext + +import ( + "context" + "sort" + "strings" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + rbacv1 "k8s.io/api/rbac/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/topology" +) + +// Options carries everything Build needs to compute a ResourceContext. +// +// Per the v1 contract, this package depends only on pkg/topology — callers +// in internal/* pre-compute IssueSummary / AuditSummary / PolicyReports and +// pass them in, so we don't reach into internal/issues or internal/audit. +type Options struct { + Tier ContextTier + MaxTokens int // reserved for future budgeting; not enforced in v1 + + // AccessChecker gates every emitted ContextRef. nil = no gating (treat + // as fully authorized — local-kubeconfig / tests). + AccessChecker RefAccessChecker + + // Topology data sources. When Topology is nil, the topology-derived + // fields (Exposes, SelectedBy, ScaledBy) are skipped. + Topology *topology.Topology + Provider topology.ResourceProvider + DynamicProv topology.DynamicProvider + + // Pre-computed summaries — pass-through into the response. + IssueSummary *IssueSummary + AuditSummary *AuditSummary + PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings + + // EmitHints controls whether SynthesizeHints runs over the structured + // fields. AI-facing callers (MCP, /api/ai/*) set true; UI callers false. + EmitHints bool +} + +// PolicyReportLookup is the minimal interface Build needs from the +// PolicyReport index. The concrete index lives in pkg/policyreports. +// +// Build does not import pkg/policyreports directly because callers may +// adapt other policy engines into the same shape. +type PolicyReportLookup interface { + FindingsFor(kind, namespace, name string) []KyvernoFinding +} + +// RefAccessChecker abstracts the RBAC check so this package doesn't import +// any internal/* package. REST and MCP handlers each implement this with a +// request-scoped batch cache (see internal/server/rc_rbac.go). +// +// Implementations should treat (group, kind, namespace) as the cache key — +// per-name SAR has no upside since RBAC is namespace-granular. +type RefAccessChecker interface { + CanRead(ctx context.Context, group, kind, namespace string) bool +} + +// Build produces a ResourceContext for obj at the requested tier. +// +// Returns nil when obj is nil. Returns a zero-value (.Tier-only) +// ResourceContext when obj is recognized but no enrichment fields apply. +// Never panics on nil sub-fields of opts. +func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceContext { + if obj == nil { + return nil + } + + ident, ok := identityOf(obj) + if !ok { + return &ResourceContext{Tier: opts.Tier} + } + + rc := &ResourceContext{Tier: opts.Tier} + omitted := newOmittedTracker() + + // 1. ManagedBy — owner chain + GitOps labels/annotations + rc.ManagedBy = filterRefs( + ctx, opts.AccessChecker, + buildManagedBy(ident), + "managedBy", omitted, + ) + + // 2. Topology-derived: Exposes, SelectedBy, ScaledBy + var rel *topology.Relationships + if opts.Topology != nil { + rel = topology.GetRelationships(ident.Kind, ident.Namespace, ident.Name, opts.Topology, opts.Provider, opts.DynamicProv) + } + if rel != nil { + exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) + exposes = append(exposes, rel.Services...) + exposes = append(exposes, rel.Ingresses...) + exposes = append(exposes, rel.Gateways...) + exposes = append(exposes, rel.Routes...) + rc.Exposes = filterRefs(ctx, opts.AccessChecker, + toContextRefs(exposes, ReasonLabelSelector, SourceTopology), + "exposes", omitted) + + selected := make([]topology.ResourceRef, 0, len(rel.PDBs)+len(rel.NetworkPolicies)) + selected = append(selected, rel.PDBs...) + selected = append(selected, rel.NetworkPolicies...) + rc.SelectedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(selected, ReasonPodSelector, SourceTopology), + "selectedBy", omitted) + + rc.ScaledBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(rel.Scalers, ReasonScaleTargetRef, SourceTopology), + "scaledBy", omitted) + } + + // 3. Pod-specific: Uses + RunsOn + if pod, ok := obj.(*corev1.Pod); ok { + rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) + + if pod.Spec.NodeName != "" { + candidate := &ContextRef{ + Kind: "Node", + Name: pod.Spec.NodeName, + Reason: ReasonNodeName, + Source: SourceK8sSpec, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.RunsOn = candidate + } else { + omitted.add("runsOn", OmittedRBACDenied) + } + } + } + + // 4. Pre-computed summaries — pass-through. + rc.IssueSummary = opts.IssueSummary + rc.AuditSummary = opts.AuditSummary + + // 5. PolicyReports — Kyverno findings rolled up. + if opts.PolicyReports != nil { + findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) + if len(findings) > 0 { + rc.PolicySummary = buildPolicySummary(findings) + } + } + + // 6. Hints — AI-only. + if opts.EmitHints { + rc.Hints = SynthesizeHints(rc, opts.Tier) + } + + rc.Omitted = omitted.collect() + return rc +} + +// --------------------------------------------------------------------------- +// Identity extraction +// --------------------------------------------------------------------------- + +// resourceIdentity is the projection of obj that Build needs without holding +// on to the full runtime.Object. Owner refs and labels feed ManagedBy; the +// (Kind, Namespace, Name) tuple keys topology + summary lookups. +type resourceIdentity struct { + Kind string + Group string + Namespace string + Name string + Labels map[string]string + Annotations map[string]string + Owners []metav1.OwnerReference +} + +// identityOf extracts identity from a typed K8s object or unstructured. +// Returns (_, false) for unknown shapes so callers can short-circuit. +func identityOf(obj runtime.Object) (resourceIdentity, bool) { + if obj == nil { + return resourceIdentity{}, false + } + switch v := obj.(type) { + case *corev1.Pod: + return identFromMeta("Pod", "", &v.ObjectMeta), true + case *corev1.Service: + return identFromMeta("Service", "", &v.ObjectMeta), true + case *corev1.ConfigMap: + return identFromMeta("ConfigMap", "", &v.ObjectMeta), true + case *corev1.Secret: + return identFromMeta("Secret", "", &v.ObjectMeta), true + case *corev1.Node: + return identFromMeta("Node", "", &v.ObjectMeta), true + case *corev1.Namespace: + return identFromMeta("Namespace", "", &v.ObjectMeta), true + case *corev1.PersistentVolume: + return identFromMeta("PersistentVolume", "", &v.ObjectMeta), true + case *corev1.PersistentVolumeClaim: + return identFromMeta("PersistentVolumeClaim", "", &v.ObjectMeta), true + case *corev1.ServiceAccount: + return identFromMeta("ServiceAccount", "", &v.ObjectMeta), true + case *corev1.Event: + return identFromMeta("Event", "", &v.ObjectMeta), true + case *corev1.LimitRange: + return identFromMeta("LimitRange", "", &v.ObjectMeta), true + case *appsv1.Deployment: + return identFromMeta("Deployment", "apps", &v.ObjectMeta), true + case *appsv1.DaemonSet: + return identFromMeta("DaemonSet", "apps", &v.ObjectMeta), true + case *appsv1.StatefulSet: + return identFromMeta("StatefulSet", "apps", &v.ObjectMeta), true + case *appsv1.ReplicaSet: + return identFromMeta("ReplicaSet", "apps", &v.ObjectMeta), true + case *autoscalingv2.HorizontalPodAutoscaler: + return identFromMeta("HorizontalPodAutoscaler", "autoscaling", &v.ObjectMeta), true + case *batchv1.Job: + return identFromMeta("Job", "batch", &v.ObjectMeta), true + case *batchv1.CronJob: + return identFromMeta("CronJob", "batch", &v.ObjectMeta), true + case *networkingv1.Ingress: + return identFromMeta("Ingress", "networking.k8s.io", &v.ObjectMeta), true + case *networkingv1.NetworkPolicy: + return identFromMeta("NetworkPolicy", "networking.k8s.io", &v.ObjectMeta), true + case *policyv1.PodDisruptionBudget: + return identFromMeta("PodDisruptionBudget", "policy", &v.ObjectMeta), true + case *storagev1.StorageClass: + return identFromMeta("StorageClass", "storage.k8s.io", &v.ObjectMeta), true + case *rbacv1.Role: + return identFromMeta("Role", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRole: + return identFromMeta("ClusterRole", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.RoleBinding: + return identFromMeta("RoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRoleBinding: + return identFromMeta("ClusterRoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *unstructured.Unstructured: + gvk := v.GroupVersionKind() + return resourceIdentity{ + Kind: gvk.Kind, + Group: gvk.Group, + Namespace: v.GetNamespace(), + Name: v.GetName(), + Labels: v.GetLabels(), + Annotations: v.GetAnnotations(), + Owners: v.GetOwnerReferences(), + }, true + } + return resourceIdentity{}, false +} + +func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { + return resourceIdentity{ + Kind: kind, + Group: group, + Namespace: m.Namespace, + Name: m.Name, + Labels: m.Labels, + Annotations: m.Annotations, + Owners: m.OwnerReferences, + } +} + +// --------------------------------------------------------------------------- +// ManagedBy detection +// --------------------------------------------------------------------------- + +// GitOps label/annotation keys — kept in sync with packages/k8s-ui/src/utils/gitops-owner.ts. +const ( + argoTrackingIDAnnotation = "argocd.argoproj.io/tracking-id" + argoInstanceLabel = "argocd.argoproj.io/instance" + fluxKustomizeNameLabel = "kustomize.toolkit.fluxcd.io/name" + fluxKustomizeNSLabel = "kustomize.toolkit.fluxcd.io/namespace" + fluxHelmNameLabel = "helm.toolkit.fluxcd.io/name" + fluxHelmNSLabel = "helm.toolkit.fluxcd.io/namespace" +) + +// buildManagedBy returns the ContextRefs describing what manages this +// resource. Precedence (most-specific wins): +// 1. Flux HelmRelease labels +// 2. Flux Kustomization labels +// 3. Argo tracking-id annotation +// 4. Argo instance label +// 5. First owner reference (controller=true preferred) +// +// Only one path emits today — the field is a slice so future taxonomies +// (e.g. dual ArgoCD + Flux) can list multiple managers without a wire change. +func buildManagedBy(ident resourceIdentity) []ContextRef { + if name, ns, ok := readPair(ident.Labels, fluxHelmNameLabel, fluxHelmNSLabel); ok { + return []ContextRef{{ + Kind: "HelmRelease", + Group: "helm.toolkit.fluxcd.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + if name, ns, ok := readPair(ident.Labels, fluxKustomizeNameLabel, fluxKustomizeNSLabel); ok { + return []ContextRef{{ + Kind: "Kustomization", + Group: "kustomize.toolkit.fluxcd.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + if id := ident.Annotations[argoTrackingIDAnnotation]; id != "" { + if ns, name, ok := parseArgoTrackingID(id); ok && name != "" { + return []ContextRef{{ + Kind: "Application", + Group: "argoproj.io", + Namespace: ns, + Name: name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + } + if inst := ident.Labels[argoInstanceLabel]; inst != "" { + // App namespace unknown without tracking-id — emit with empty ns + // like the UI does; the consumer decides whether to navigate. + return []ContextRef{{ + Kind: "Application", + Group: "argoproj.io", + Name: inst, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + + if owner := pickControllerOwner(ident.Owners); owner != nil { + group := groupFromAPIVersion(owner.APIVersion) + return []ContextRef{{ + Kind: owner.Kind, + Group: group, + Namespace: ident.Namespace, + Name: owner.Name, + Reason: ReasonOwnerReference, + Source: SourceOwnerChain, + }} + } + return nil +} + +func readPair(m map[string]string, k1, k2 string) (string, string, bool) { + a := m[k1] + b := m[k2] + if a == "" || b == "" { + return "", "", false + } + return a, b, true +} + +// parseArgoTrackingID mirrors gitops-owner.ts. Two forms: +// +// ":..." (legacy, single name) +// "_:..." (namespaced install) +// +// Returns (ns, name, ok). +func parseArgoTrackingID(value string) (string, string, bool) { + colon := strings.IndexByte(value, ':') + if colon < 0 { + return "", "", false + } + head := value[:colon] + if head == "" { + return "", "", false + } + if sep := strings.IndexByte(head, '_'); sep >= 0 { + ns := head[:sep] + name := head[sep+1:] + if name == "" { + return "", "", false + } + return ns, name, true + } + return "", head, true +} + +// pickControllerOwner returns the first owner with Controller=true; falls +// back to the first owner if none are marked controller. Returns nil when +// the slice is empty. +func pickControllerOwner(owners []metav1.OwnerReference) *metav1.OwnerReference { + for i := range owners { + if owners[i].Controller != nil && *owners[i].Controller { + return &owners[i] + } + } + if len(owners) > 0 { + return &owners[0] + } + return nil +} + +// groupFromAPIVersion extracts the group from "group/version" or "version" +// (core/v1 form). Mirrors schema.ParseGroupVersion without the import. +func groupFromAPIVersion(apiVersion string) string { + if i := strings.IndexByte(apiVersion, '/'); i >= 0 { + return apiVersion[:i] + } + return "" +} + +// --------------------------------------------------------------------------- +// Uses (Pod-specific) +// --------------------------------------------------------------------------- + +// buildUsesFromPod extracts ConfigMap/Secret/PVC/ServiceAccount references +// from pod.Spec. Returns nil when the pod uses no configuration. +// +// Sources scanned: +// - Volumes: ConfigMap / Secret / PVC / Projected (configMap + secret entries) +// - Containers (init + regular): EnvFrom configMapRef/secretRef, Env valueFrom.{configMap,secret}KeyRef +// - Spec.ServiceAccountName +func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + if pod == nil { + return nil + } + + cmSet := newRefSet() + secretSet := newRefSet() + pvcSet := newRefSet() + + scanVolumes(pod.Spec.Volumes, pod.Namespace, cmSet, secretSet, pvcSet) + scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) + scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) + + uses := &UsesBlock{ + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonEnvVarRef, SourceK8sSpec), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonVolumeMount, SourceK8sSpec), "uses.secrets", omitted), + PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), + } + + if sa := pod.Spec.ServiceAccountName; sa != "" { + candidate := &ContextRef{ + Kind: "ServiceAccount", + Namespace: pod.Namespace, + Name: sa, + Reason: ReasonSAName, + Source: SourceK8sSpec, + } + if checkRef(ctx, ac, candidate) { + uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + if len(uses.ConfigMaps) == 0 && len(uses.Secrets) == 0 && len(uses.PVCs) == 0 && uses.ServiceAccount == nil { + return nil + } + return uses +} + +func scanVolumes(vols []corev1.Volume, ns string, cm, secret, pvc *refSet) { + for _, v := range vols { + if v.ConfigMap != nil { + cm.add(v.ConfigMap.Name, ns) + } + if v.Secret != nil { + secret.add(v.Secret.SecretName, ns) + } + if v.PersistentVolumeClaim != nil { + pvc.add(v.PersistentVolumeClaim.ClaimName, ns) + } + if v.Projected != nil { + for _, src := range v.Projected.Sources { + if src.ConfigMap != nil { + cm.add(src.ConfigMap.Name, ns) + } + if src.Secret != nil { + secret.add(src.Secret.Name, ns) + } + } + } + } +} + +func scanContainers(containers []corev1.Container, ns string, cm, secret *refSet) { + for _, c := range containers { + for _, ef := range c.EnvFrom { + if ef.ConfigMapRef != nil { + cm.add(ef.ConfigMapRef.Name, ns) + } + if ef.SecretRef != nil { + secret.add(ef.SecretRef.Name, ns) + } + } + for _, e := range c.Env { + if e.ValueFrom == nil { + continue + } + if e.ValueFrom.ConfigMapKeyRef != nil { + cm.add(e.ValueFrom.ConfigMapKeyRef.Name, ns) + } + if e.ValueFrom.SecretKeyRef != nil { + secret.add(e.ValueFrom.SecretKeyRef.Name, ns) + } + } + } +} + +// refSet collects (name, namespace) pairs with insertion-order preservation +// for deterministic output. Names with empty namespaces are tolerated (the +// PVC ClaimName can be cluster-scoped only in odd configurations, but we +// pass through whatever the pod spec says). +type refSet struct { + seen map[string]bool + order []nsName +} + +type nsName struct { + Namespace string + Name string +} + +func newRefSet() *refSet { + return &refSet{seen: make(map[string]bool)} +} + +func (s *refSet) add(name, ns string) { + if name == "" { + return + } + key := ns + "/" + name + if s.seen[key] { + return + } + s.seen[key] = true + s.order = append(s.order, nsName{Namespace: ns, Name: name}) +} + +// refs returns the accumulated set as ContextRefs sorted by (namespace, name) +// for deterministic golden output. +func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) []ContextRef { + if len(s.order) == 0 { + return nil + } + out := make([]ContextRef, len(s.order)) + sorted := append([]nsName(nil), s.order...) + sort.Slice(sorted, func(i, j int) bool { + if sorted[i].Namespace != sorted[j].Namespace { + return sorted[i].Namespace < sorted[j].Namespace + } + return sorted[i].Name < sorted[j].Name + }) + for i, e := range sorted { + out[i] = ContextRef{ + Kind: kind, + Group: group, + Namespace: e.Namespace, + Name: e.Name, + Reason: reason, + Source: source, + } + } + return out +} + +// --------------------------------------------------------------------------- +// Topology ref → ContextRef +// --------------------------------------------------------------------------- + +// toContextRefs translates a slice of topology.ResourceRef into ContextRefs +// with the given reason+source. Sorted by (kind, namespace, name) for +// determinism — golden tests rely on this ordering. +func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSource) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + for _, r := range refs { + out = append(out, ContextRef{ + Kind: r.Kind, + Group: r.Group, + Namespace: r.Namespace, + Name: r.Name, + Reason: reason, + Source: source, + }) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Kind != out[j].Kind { + return out[i].Kind < out[j].Kind + } + if out[i].Namespace != out[j].Namespace { + return out[i].Namespace < out[j].Namespace + } + return out[i].Name < out[j].Name + }) + return out +} + +// --------------------------------------------------------------------------- +// RBAC gating +// --------------------------------------------------------------------------- + +// filterRefs applies the access check to each ref. Denied refs are dropped +// and one omitted entry is recorded per field (deduped by the tracker). +// When ac is nil (local-kubeconfig / no auth), every ref passes. +func filterRefs(ctx context.Context, ac RefAccessChecker, refs []ContextRef, fieldPath string, omitted *omittedTracker) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + deniedAny := false + for _, r := range refs { + if !checkRef(ctx, ac, &r) { + deniedAny = true + continue + } + out = append(out, r) + } + if deniedAny { + omitted.add(fieldPath, OmittedRBACDenied) + } + if len(out) == 0 { + return nil + } + return out +} + +// checkRef returns true when ac permits a read of (group, kind, namespace). +// Nil ac = permit everything. +func checkRef(ctx context.Context, ac RefAccessChecker, r *ContextRef) bool { + if ac == nil || r == nil { + return true + } + return ac.CanRead(ctx, r.Group, r.Kind, r.Namespace) +} + +// --------------------------------------------------------------------------- +// Policy summary +// --------------------------------------------------------------------------- + +// buildPolicySummary rolls up Kyverno findings into the summary block. +// Top findings are picked first by fail > warn > error > pass, then by +// stable input order — callers can prune to MAX (3 today). +const policySummaryTopMax = 3 + +func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { + var fail, warn, pass int + for _, f := range findings { + switch f.Result { + case "fail": + fail++ + case "warn": + warn++ + case "pass": + pass++ + } + } + + // Order Top by result priority; cap to policySummaryTopMax. + ordered := append([]KyvernoFinding(nil), findings...) + sort.SliceStable(ordered, func(i, j int) bool { + return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) + }) + if len(ordered) > policySummaryTopMax { + ordered = ordered[:policySummaryTopMax] + } + + return &PolicySummary{ + Kyverno: &KyvernoSummary{ + Fail: fail, + Warn: warn, + Pass: pass, + Top: ordered, + }, + } +} + +func resultRank(r string) int { + switch r { + case "fail": + return 0 + case "warn": + return 1 + case "error": + return 2 + case "pass": + return 3 + default: + return 4 + } +} + +// --------------------------------------------------------------------------- +// Omitted tracker +// --------------------------------------------------------------------------- + +// omittedTracker deduplicates (field, reason) entries so callers don't emit +// "managedBy" / OmittedRBACDenied twice when multiple refs in the same field +// fail. Insertion order is preserved for stable JSON output. +type omittedTracker struct { + seen map[string]bool + items []OmittedField +} + +func newOmittedTracker() *omittedTracker { + return &omittedTracker{seen: make(map[string]bool)} +} + +func (t *omittedTracker) add(field string, reason OmittedReason) { + key := field + "|" + string(reason) + if t.seen[key] { + return + } + t.seen[key] = true + t.items = append(t.items, OmittedField{Field: field, Reason: reason}) +} + +func (t *omittedTracker) collect() []OmittedField { + if len(t.items) == 0 { + return nil + } + return t.items +} diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go new file mode 100644 index 000000000..503fee9d9 --- /dev/null +++ b/pkg/resourcecontext/build_test.go @@ -0,0 +1,590 @@ +package resourcecontext + +import ( + "context" + "encoding/json" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/skyhook-io/radar/pkg/topology" +) + +// --------------------------------------------------------------------------- +// Test scaffolding +// --------------------------------------------------------------------------- + +// allowAllChecker permits every CanRead check. Used by the happy-path +// goldens that don't exercise RBAC denial. +type allowAllChecker struct{} + +func (allowAllChecker) CanRead(_ context.Context, _, _, _ string) bool { return true } + +// denyChecker denies a specific (group, kind, namespace) tuple and permits +// everything else. Tests the "omitted: rbac_denied" path without requiring +// the full server stack. +type denyChecker struct { + group string + kind string + namespace string +} + +func (d denyChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + return !(group == d.group && kind == d.kind && namespace == d.namespace) +} + +// mockPolicyReports implements PolicyReportLookup. +type mockPolicyReports map[string][]KyvernoFinding + +func (m mockPolicyReports) FindingsFor(kind, namespace, name string) []KyvernoFinding { + return m[kind+"/"+namespace+"/"+name] +} + +// --------------------------------------------------------------------------- +// Golden-file tests +// --------------------------------------------------------------------------- + +func TestBuild_Pod_FullEnrichment(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web-abc", + Namespace: "prod", + Labels: map[string]string{ + "app.kubernetes.io/name": "web", + }, + Annotations: map[string]string{ + argoTrackingIDAnnotation: "argocd_storefront:apps/Deployment:prod/web", + }, + OwnerReferences: []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "web-7d", Controller: ptrBool(true)}, + }, + }, + Spec: corev1.PodSpec{ + NodeName: "node-1", + ServiceAccountName: "web-sa", + Volumes: []corev1.Volume{ + { + Name: "config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "web-config"}, + }, + }, + }, + { + Name: "creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"}, + }, + }, + { + Name: "data", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "web-data"}, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "web", + EnvFrom: []corev1.EnvFromSource{ + {ConfigMapRef: &corev1.ConfigMapEnvSource{LocalObjectReference: corev1.LocalObjectReference{Name: "shared-env"}}}, + }, + Env: []corev1.EnvVar{ + { + Name: "API_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api-key-secret"}, + Key: "key", + }, + }, + }, + }, + }, + }, + }, + } + + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "pod/prod/web-abc", Kind: topology.KindPod, Name: "web-abc"}, + {ID: "service/prod/web", Kind: topology.KindService, Name: "web"}, + {ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"}, + {ID: "poddisruptionbudget/prod/web-pdb", Kind: topology.KindPDB, Name: "web-pdb"}, + {ID: "horizontalpodautoscaler/prod/web-hpa", Kind: topology.KindHPA, Name: "web-hpa"}, + }, + Edges: []topology.Edge{ + {Source: "service/prod/web", Target: "pod/prod/web-abc", Type: topology.EdgeRoutesTo}, + {Source: "networkpolicy/prod/default-deny", Target: "pod/prod/web-abc", Type: topology.EdgeProtects}, + {Source: "poddisruptionbudget/prod/web-pdb", Target: "pod/prod/web-abc", Type: topology.EdgeProtects}, + {Source: "horizontalpodautoscaler/prod/web-hpa", Target: "pod/prod/web-abc", Type: topology.EdgeUses}, + }, + } + + opts := Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + IssueSummary: &IssueSummary{ + Count: 1, HighestSeverity: "critical", TopReason: "ImagePullBackOff", + BySource: map[string]int{"problem": 1}, + }, + } + + rc := Build(context.Background(), pod, opts) + if rc == nil { + t.Fatal("Build returned nil") + } + + // ManagedBy: argo tracking-id annotation wins over owner reference. + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d (%+v)", got, want, rc.ManagedBy) + } + mb := rc.ManagedBy[0] + if mb.Kind != "Application" || mb.Name != "storefront" || mb.Namespace != "argocd" { + t.Errorf("ManagedBy[0]: got %+v, want Application argocd/storefront", mb) + } + if mb.Source != SourceOwnerChain { + t.Errorf("ManagedBy[0].Source: got %q want %q", mb.Source, SourceOwnerChain) + } + + // Exposes: the Service routes to the pod. + if got, want := len(rc.Exposes), 1; got != want { + t.Fatalf("Exposes len: got %d want %d (%+v)", got, want, rc.Exposes) + } + if rc.Exposes[0].Kind != "Service" || rc.Exposes[0].Name != "web" { + t.Errorf("Exposes[0]: got %+v want Service/prod/web", rc.Exposes[0]) + } + + // SelectedBy: NP + PDB, sorted by kind (NetworkPolicy < PodDisruptionBudget). + if got, want := len(rc.SelectedBy), 2; got != want { + t.Fatalf("SelectedBy len: got %d want %d (%+v)", got, want, rc.SelectedBy) + } + if rc.SelectedBy[0].Kind != "NetworkPolicy" || rc.SelectedBy[1].Kind != "PodDisruptionBudget" { + t.Errorf("SelectedBy order: got %s,%s want NetworkPolicy,PodDisruptionBudget", + rc.SelectedBy[0].Kind, rc.SelectedBy[1].Kind) + } + + // ScaledBy: HPA. + if got, want := len(rc.ScaledBy), 1; got != want { + t.Fatalf("ScaledBy len: got %d want %d", got, want) + } + if rc.ScaledBy[0].Kind != "HorizontalPodAutoscaler" { + t.Errorf("ScaledBy[0].Kind: got %q", rc.ScaledBy[0].Kind) + } + + // RunsOn: Node. + if rc.RunsOn == nil || rc.RunsOn.Name != "node-1" { + t.Errorf("RunsOn: got %+v want Node/node-1", rc.RunsOn) + } + + // Uses: 2 ConfigMaps (web-config + shared-env), 2 Secrets (web-creds + api-key-secret), 1 PVC, ServiceAccount. + if rc.Uses == nil { + t.Fatal("Uses: got nil") + } + if got, want := len(rc.Uses.ConfigMaps), 2; got != want { + t.Errorf("Uses.ConfigMaps len: got %d want %d (%+v)", got, want, rc.Uses.ConfigMaps) + } + if got, want := len(rc.Uses.Secrets), 2; got != want { + t.Errorf("Uses.Secrets len: got %d want %d (%+v)", got, want, rc.Uses.Secrets) + } + if got, want := len(rc.Uses.PVCs), 1; got != want { + t.Errorf("Uses.PVCs len: got %d want %d", got, want) + } + if rc.Uses.ServiceAccount == nil || rc.Uses.ServiceAccount.Name != "web-sa" { + t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount) + } + + // Hints: deterministic ordering covers the high-signal fields. + wantHints := []string{ + "Managed by Application storefront", + "1 issue (critical: ImagePullBackOff)", + "Running on node node-1", + "Exposed by 1 Service", + "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", + "Scaled by 1 HorizontalPodAutoscaler", + "Uses 2 ConfigMaps, 2 Secrets, 1 PVC, ServiceAccount web-sa", + } + if !reflect.DeepEqual(rc.Hints, wantHints) { + t.Errorf("Hints mismatch.\n got: %v\nwant: %v", rc.Hints, wantHints) + } + + // Pre-computed summaries are passed through. + if rc.IssueSummary == nil || rc.IssueSummary.Count != 1 { + t.Errorf("IssueSummary not passed through: %+v", rc.IssueSummary) + } + if rc.AuditSummary != nil { + t.Errorf("AuditSummary: want nil, got %+v", rc.AuditSummary) + } +} + +func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { + // Flux HelmRelease labels take precedence over owner references — + // owner is "ReplicaSet web-7d" but Flux labels point at HelmRelease. + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web", + Namespace: "prod", + Labels: map[string]string{ + fluxHelmNameLabel: "web", + fluxHelmNSLabel: "flux-system", + }, + }, + } + + rc := Build(context.Background(), dep, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + if rc == nil { + t.Fatal("Build returned nil") + } + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d", got, want) + } + mb := rc.ManagedBy[0] + if mb.Kind != "HelmRelease" || mb.Name != "web" || mb.Namespace != "flux-system" { + t.Errorf("ManagedBy[0]: got %+v want HelmRelease/flux-system/web", mb) + } + if mb.Group != "helm.toolkit.fluxcd.io" { + t.Errorf("ManagedBy[0].Group: got %q", mb.Group) + } + wantHint := "Managed by HelmRelease web" + if len(rc.Hints) == 0 || rc.Hints[0] != wantHint { + t.Errorf("first Hint: got %v want %q", rc.Hints, wantHint) + } +} + +func TestBuild_Service_ExposedByIngress(t *testing.T) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "service/prod/api", Kind: topology.KindService, Name: "api"}, + {ID: "ingress/prod/api-ingress", Kind: topology.KindIngress, Name: "api-ingress"}, + }, + Edges: []topology.Edge{ + {Source: "ingress/prod/api-ingress", Target: "service/prod/api", Type: topology.EdgeRoutesTo}, + }, + } + rc := Build(context.Background(), svc, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + }) + + if got, want := len(rc.Exposes), 1; got != want { + t.Fatalf("Exposes len: got %d want %d", got, want) + } + if rc.Exposes[0].Kind != "Ingress" || rc.Exposes[0].Name != "api-ingress" { + t.Errorf("Exposes[0]: got %+v", rc.Exposes[0]) + } + // Service has no Uses block — make sure we don't synthesize an empty one. + if rc.Uses != nil { + t.Errorf("Uses should be nil for Service: got %+v", rc.Uses) + } +} + +func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { + // NetworkPolicy on the "policy side" emits an outgoing EdgeProtects to + // the workload it selects. The topology relationships projection does + // NOT surface that direction (see relationships.go's intentional skip). + // Build inherits this — the NP should have nothing in SelectedBy. + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default-deny", Namespace: "prod"}, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "networkpolicy/prod/default-deny", Target: "deployment/prod/web", Type: topology.EdgeProtects}, + }, + } + rc := Build(context.Background(), np, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + EmitHints: true, + }) + if rc == nil { + t.Fatal("Build returned nil") + } + if len(rc.SelectedBy) != 0 { + t.Errorf("SelectedBy: expected empty (outgoing EdgeProtects not surfaced), got %+v", rc.SelectedBy) + } +} + +func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { + // A ConfigMap with a controller owner reference. No topology, no Pod + // spec — just owner-chain ManagedBy. + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web-config", + Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "Deployment", APIVersion: "apps/v1", Name: "web", Controller: ptrBool(true)}, + }, + }, + } + rc := Build(context.Background(), cm, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d", got, want) + } + mb := rc.ManagedBy[0] + if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" || mb.Group != "apps" { + t.Errorf("ManagedBy[0]: got %+v", mb) + } +} + +func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { + // Deny reads on Secrets in the pod's namespace — buildUsesFromPod + // should drop them all and emit an omitted entry. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{ + Name: "creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"}, + }, + }}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: denyChecker{group: "", kind: "Secret", namespace: "prod"}, + EmitHints: true, + }) + if rc.Uses != nil && len(rc.Uses.Secrets) != 0 { + t.Errorf("Secrets should be empty after deny; got %+v", rc.Uses.Secrets) + } + gotOmitted := false + for _, o := range rc.Omitted { + if o.Field == "uses.secrets" && o.Reason == OmittedRBACDenied { + gotOmitted = true + break + } + } + if !gotOmitted { + t.Errorf("expected omitted [uses.secrets, rbac_denied]; got %+v", rc.Omitted) + } +} + +func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{{Kind: "Foo", APIVersion: "ex.io/v1", Name: "f", Controller: ptrBool(true)}}}, + } + rc := Build(context.Background(), dep, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: false, + }) + if len(rc.Hints) != 0 { + t.Errorf("EmitHints=false but got hints: %v", rc.Hints) + } + // Structured fields still populated. + if len(rc.ManagedBy) != 1 { + t.Errorf("ManagedBy should still be populated: %+v", rc.ManagedBy) + } +} + +func TestBuild_NilObj(t *testing.T) { + if rc := Build(context.Background(), nil, Options{}); rc != nil { + t.Errorf("Build(nil) = %+v, want nil", rc) + } +} + +func TestBuild_HPA_Identity(t *testing.T) { + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "web-hpa", Namespace: "prod"}, + } + rc := Build(context.Background(), hpa, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc == nil { + t.Fatal("Build returned nil for HPA") + } + if rc.Tier != TierBasic { + t.Errorf("Tier: got %q want %q", rc.Tier, TierBasic) + } +} + +func TestBuild_PolicyReports_RolledUp(t *testing.T) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + EmitHints: true, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) == 0 || k.Top[0].Result != "fail" { + t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + } + gotHint := false + for _, h := range rc.Hints { + if h == "Kyverno: 1 failing, 1 warning" { + gotHint = true + break + } + } + if !gotHint { + t.Errorf("expected Kyverno hint; got %v", rc.Hints) + } +} + +func TestBuild_PDB_OutputJSONShape(t *testing.T) { + // Pin the wire shape one full populated Build produces, so a future + // reorder of fields (or accidental omitempty change) is caught. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "rs", Controller: ptrBool(true)}, + }, + }, + Spec: corev1.PodSpec{NodeName: "n1"}, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + EmitHints: true, + }) + b, err := json.MarshalIndent(rc, "", " ") + if err != nil { + t.Fatalf("marshal: %v", err) + } + // Spot-check: tier basic, owner ref managedBy, runsOn node. + want := `"managedBy"` + if !contains(string(b), want) { + t.Errorf("JSON missing %s\n%s", want, b) + } + if !contains(string(b), `"tier": "basic"`) { + t.Errorf("JSON missing tier=basic\n%s", b) + } + if !contains(string(b), `"runsOn"`) { + t.Errorf("JSON missing runsOn\n%s", b) + } +} + +// --------------------------------------------------------------------------- +// Sub-helpers' unit coverage +// --------------------------------------------------------------------------- + +func TestParseArgoTrackingID(t *testing.T) { + cases := []struct { + in string + wantNS string + wantName string + wantOK bool + shortName string + }{ + {"argocd_store:apps/Deployment:prod/web", "argocd", "store", true, "namespaced form"}, + {"store:apps/Deployment:prod/web", "", "store", true, "legacy form"}, + {"", "", "", false, "empty"}, + {":foo/bar", "", "", false, "missing head"}, + {"a_:foo", "", "", false, "missing name"}, + } + for _, c := range cases { + t.Run(c.shortName, func(t *testing.T) { + ns, name, ok := parseArgoTrackingID(c.in) + if ns != c.wantNS || name != c.wantName || ok != c.wantOK { + t.Errorf("parseArgoTrackingID(%q) = (%q, %q, %v) want (%q, %q, %v)", + c.in, ns, name, ok, c.wantNS, c.wantName, c.wantOK) + } + }) + } +} + +func TestGroupFromAPIVersion(t *testing.T) { + cases := map[string]string{ + "v1": "", + "apps/v1": "apps", + "argoproj.io/v1alpha1": "argoproj.io", + "networking.k8s.io/v1": "networking.k8s.io", + "helm.toolkit.fluxcd.io/v2beta1": "helm.toolkit.fluxcd.io", + } + for in, want := range cases { + if got := groupFromAPIVersion(in); got != want { + t.Errorf("groupFromAPIVersion(%q) = %q, want %q", in, got, want) + } + } +} + +func TestPickControllerOwner_PrefersController(t *testing.T) { + owners := []metav1.OwnerReference{ + {Kind: "Other", Name: "x"}, + {Kind: "Boss", Name: "ctrl", Controller: ptrBool(true)}, + } + got := pickControllerOwner(owners) + if got == nil || got.Name != "ctrl" { + t.Errorf("got %+v, want ctrl", got) + } +} + +func TestPickControllerOwner_FallsBackToFirst(t *testing.T) { + owners := []metav1.OwnerReference{ + {Kind: "Solo", Name: "first"}, + {Kind: "Other", Name: "x"}, + } + got := pickControllerOwner(owners) + if got == nil || got.Name != "first" { + t.Errorf("got %+v, want first", got) + } + if got := pickControllerOwner(nil); got != nil { + t.Errorf("nil owners should return nil, got %+v", got) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func ptrBool(b bool) *bool { return &b } + +func contains(s, sub string) bool { + return len(s) >= len(sub) && indexOf(s, sub) >= 0 +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} + +// Compile-time pin: keep PDB and Networking imports referenced for future tests. +var ( + _ = policyv1.PodDisruptionBudget{} +) diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go new file mode 100644 index 000000000..30ccd698b --- /dev/null +++ b/pkg/resourcecontext/hints.go @@ -0,0 +1,232 @@ +package resourcecontext + +import ( + "fmt" + "sort" + "strings" +) + +// SynthesizeHints renders a short, deterministic prose summary of the +// structured fields in c. Returns at most maxHintsBasic lines for +// TierBasic; future tiers can expand the budget. +// +// Ordering is fixed (not data-driven) so golden tests stay stable across +// runs. No LLM is involved — every line maps to a single rule. +// +// Callers SHOULD NOT parse hints — the structured fields are the canonical +// surface. Hints exist solely as a prose convenience for AI consumers. +func SynthesizeHints(c *ResourceContext, tier ContextTier) []string { + if c == nil { + return nil + } + + max := maxHintsBasic + if tier == TierDiagnostic { + max = maxHintsDiagnostic + } + + out := make([]string, 0, max) + + if h := managedByHint(c.ManagedBy); h != "" { + out = append(out, h) + } + if h := issueHint(c.IssueSummary); h != "" { + out = append(out, h) + } + if h := auditHint(c.AuditSummary); h != "" { + out = append(out, h) + } + if h := runsOnHint(c.RunsOn); h != "" { + out = append(out, h) + } + if h := exposesHint(c.Exposes); h != "" { + out = append(out, h) + } + if h := selectedByHint(c.SelectedBy); h != "" { + out = append(out, h) + } + if h := scaledByHint(c.ScaledBy); h != "" { + out = append(out, h) + } + if h := usesHint(c.Uses); h != "" { + out = append(out, h) + } + if h := policyHint(c.PolicySummary); h != "" { + out = append(out, h) + } + + if len(out) > max { + out = out[:max] + } + if len(out) == 0 { + return nil + } + return out +} + +const ( + maxHintsBasic = 8 + maxHintsDiagnostic = 12 +) + +func managedByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + m := refs[0] + return fmt.Sprintf("Managed by %s %s", m.Kind, m.Name) +} + +func issueHint(s *IssueSummary) string { + if s == nil || s.Count == 0 { + return "" + } + noun := pluralize("issue", s.Count) + var b strings.Builder + fmt.Fprintf(&b, "%d %s", s.Count, noun) + if s.HighestSeverity != "" { + fmt.Fprintf(&b, " (%s", s.HighestSeverity) + if s.TopReason != "" { + fmt.Fprintf(&b, ": %s", s.TopReason) + } + b.WriteString(")") + } else if s.TopReason != "" { + fmt.Fprintf(&b, ": %s", s.TopReason) + } + return b.String() +} + +func auditHint(s *AuditSummary) string { + if s == nil || s.Count == 0 { + return "" + } + noun := pluralize("audit finding", s.Count) + if s.HighestSeverity == "" { + return fmt.Sprintf("%d %s", s.Count, noun) + } + return fmt.Sprintf("%d %s (%s)", s.Count, noun, s.HighestSeverity) +} + +func runsOnHint(r *ContextRef) string { + if r == nil { + return "" + } + return fmt.Sprintf("Running on node %s", r.Name) +} + +func exposesHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + return fmt.Sprintf("Exposed by %s", summarizeKindsCounts(refs)) +} + +func selectedByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + // Distinguish PDB vs NetworkPolicy in the hint — they read very + // differently to a human, and lumping them together loses signal. + var pdb, np []ContextRef + for _, r := range refs { + if r.Kind == "PodDisruptionBudget" { + pdb = append(pdb, r) + } else { + np = append(np, r) + } + } + parts := make([]string, 0, 2) + if n := len(np); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) + } + if n := len(pdb); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) + } + return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) +} + +func selectVerb(n int) string { + if n == 1 { + return "selects this resource" + } + return "select this resource" +} + +func scaledByHint(refs []ContextRef) string { + if len(refs) == 0 { + return "" + } + return fmt.Sprintf("Scaled by %s", summarizeKindsCounts(refs)) +} + +func usesHint(u *UsesBlock) string { + if u == nil { + return "" + } + parts := make([]string, 0, 4) + if n := len(u.ConfigMaps); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("ConfigMap", n))) + } + if n := len(u.Secrets); n > 0 { + parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("Secret", n))) + } + if n := len(u.PVCs); n > 0 { + parts = append(parts, fmt.Sprintf("%d PVCs", n)) + if n == 1 { + parts[len(parts)-1] = "1 PVC" + } + } + if u.ServiceAccount != nil { + parts = append(parts, fmt.Sprintf("ServiceAccount %s", u.ServiceAccount.Name)) + } + if len(parts) == 0 { + return "" + } + return "Uses " + strings.Join(parts, ", ") +} + +func policyHint(s *PolicySummary) string { + if s == nil || s.Kyverno == nil { + return "" + } + k := s.Kyverno + if k.Fail == 0 && k.Warn == 0 { + return "" + } + parts := make([]string, 0, 2) + if k.Fail > 0 { + parts = append(parts, fmt.Sprintf("%d failing", k.Fail)) + } + if k.Warn > 0 { + parts = append(parts, fmt.Sprintf("%d warning", k.Warn)) + } + return "Kyverno: " + strings.Join(parts, ", ") +} + +// summarizeKindsCounts groups refs by kind and emits "N Kind, M OtherKind" +// (deterministic order: alphabetical by kind). +func summarizeKindsCounts(refs []ContextRef) string { + counts := make(map[string]int) + for _, r := range refs { + counts[r.Kind]++ + } + kinds := make([]string, 0, len(counts)) + for k := range counts { + kinds = append(kinds, k) + } + sort.Strings(kinds) + parts := make([]string, 0, len(kinds)) + for _, k := range kinds { + parts = append(parts, fmt.Sprintf("%d %s", counts[k], pluralize(k, counts[k]))) + } + return strings.Join(parts, ", ") +} + +// pluralize returns word + "s" when n != 1. Kept English-only; resource +// kinds are loanwords (Pod, Service, etc.) so naive pluralization works. +func pluralize(word string, n int) string { + if n == 1 { + return word + } + return word + "s" +} diff --git a/pkg/resourcecontext/hints_test.go b/pkg/resourcecontext/hints_test.go new file mode 100644 index 000000000..8bc0a30d6 --- /dev/null +++ b/pkg/resourcecontext/hints_test.go @@ -0,0 +1,118 @@ +package resourcecontext + +import ( + "reflect" + "testing" +) + +func TestSynthesizeHints_NilCtx(t *testing.T) { + if got := SynthesizeHints(nil, TierBasic); got != nil { + t.Errorf("nil ctx: got %v, want nil", got) + } +} + +func TestSynthesizeHints_EmptyCtx(t *testing.T) { + rc := &ResourceContext{Tier: TierBasic} + got := SynthesizeHints(rc, TierBasic) + if got != nil { + t.Errorf("empty rc: got %v, want nil", got) + } +} + +func TestSynthesizeHints_DeterministicOrdering(t *testing.T) { + rc := &ResourceContext{ + ManagedBy: []ContextRef{{Kind: "Application", Name: "store"}}, + Exposes: []ContextRef{{Kind: "Service", Name: "api"}}, + SelectedBy: []ContextRef{ + {Kind: "NetworkPolicy", Name: "deny"}, + {Kind: "PodDisruptionBudget", Name: "pdb"}, + }, + ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "hpa"}}, + RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, + Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}}, + IssueSummary: &IssueSummary{Count: 2, HighestSeverity: "warning", TopReason: "Backoff"}, + AuditSummary: &AuditSummary{Count: 3, HighestSeverity: "danger"}, + } + want := []string{ + "Managed by Application store", + "2 issues (warning: Backoff)", + "3 audit findings (danger)", + "Running on node n1", + "Exposed by 1 Service", + "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", + "Scaled by 1 HorizontalPodAutoscaler", + "Uses 1 ConfigMap", + } + got := SynthesizeHints(rc, TierBasic) + if !reflect.DeepEqual(got, want) { + t.Errorf("hints mismatch:\n got: %v\nwant: %v", got, want) + } +} + +func TestSynthesizeHints_BasicTierCapped(t *testing.T) { + // Synthesize a maxed-out context and verify the basic tier caps at + // maxHintsBasic lines. This guards against unbounded hint growth. + rc := &ResourceContext{ + ManagedBy: []ContextRef{{Kind: "App", Name: "a"}}, + Exposes: []ContextRef{{Kind: "Service", Name: "svc"}}, + SelectedBy: []ContextRef{{Kind: "PodDisruptionBudget", Name: "p"}, {Kind: "NetworkPolicy", Name: "n"}}, + ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "h"}}, + RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, + Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}, Secrets: []ContextRef{{Kind: "Secret", Name: "s"}}}, + IssueSummary: &IssueSummary{Count: 1, HighestSeverity: "critical", TopReason: "Crash"}, + AuditSummary: &AuditSummary{Count: 1, HighestSeverity: "danger", TopFinding: "CKV_K8S_1"}, + PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Fail: 1, Warn: 1}}, + } + got := SynthesizeHints(rc, TierBasic) + if len(got) > maxHintsBasic { + t.Errorf("basic tier exceeded cap: got %d hints, want ≤%d (%v)", len(got), maxHintsBasic, got) + } +} + +func TestSynthesizeHints_IssueHint_NoSeverity(t *testing.T) { + rc := &ResourceContext{IssueSummary: &IssueSummary{Count: 1, TopReason: "Pending"}} + got := SynthesizeHints(rc, TierBasic) + want := []string{"1 issue: Pending"} + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v, want %v", got, want) + } +} + +func TestSynthesizeHints_PolicyHint_OnlyPass_Skipped(t *testing.T) { + rc := &ResourceContext{PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Pass: 3}}} + got := SynthesizeHints(rc, TierBasic) + if got != nil { + t.Errorf("only-pass summary should not emit a hint; got %v", got) + } +} + +func TestUsesHint_PVCSingular(t *testing.T) { + rc := &ResourceContext{Uses: &UsesBlock{PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data"}}}} + got := SynthesizeHints(rc, TierBasic) + want := []string{"Uses 1 PVC"} + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v, want %v", got, want) + } +} + +func TestSelectVerb(t *testing.T) { + if selectVerb(1) != "selects this resource" { + t.Errorf("verb(1): %q", selectVerb(1)) + } + if selectVerb(2) != "select this resource" { + t.Errorf("verb(2): %q", selectVerb(2)) + } +} + +func TestSummarizeKindsCounts_AlphabeticalOrder(t *testing.T) { + refs := []ContextRef{ + {Kind: "Service", Name: "a"}, + {Kind: "Ingress", Name: "b"}, + {Kind: "Service", Name: "c"}, + } + got := summarizeKindsCounts(refs) + want := "1 Ingress, 2 Services" + if got != want { + t.Errorf("got %q want %q", got, want) + } +} From 8c9e0b0b2ce9d01d65ac11dd99015de800408e8f Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 10:48:50 +0300 Subject: [PATCH 02/17] fix(ai-handlers): wire PolicyReport index into ResourceContext.Build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught that buildAIResourceContext never set opts.PolicyReports, so policySummary.kyverno stayed absent on the REST get-resource path even after T5 (index) and T11 (composer source). The Build generator already supports it via PolicyReportLookup; the handler just wasn't passing it. Adds a narrow policyReportLookupAdapter that wraps internal/k8s.GetPolicyReportIndex() — translating the richer pkg/policyreports.Finding shape (Severity + Category) into the agent-facing resourcecontext.KyvernoFinding shape (Policy / Rule / Result / Message). Keeping the projection narrow at the adapter layer means future additions to policyreports.Finding don't perturb the wire contract. When Kyverno isn't installed, GetPolicyReportIndex() returns nil and opts.PolicyReports stays unset — Build emits no policySummary, which is the correct degraded behavior. --- internal/server/ai_handlers.go | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 4db391ef2..35adde577 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -15,10 +15,42 @@ import ( "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" "github.com/skyhook-io/radar/pkg/resourcecontext" "github.com/skyhook-io/radar/pkg/topology" ) +// policyReportLookupAdapter wraps internal/k8s.GetPolicyReportIndex() into +// the resourcecontext.PolicyReportLookup interface, translating the +// richer pkg/policyreports.Finding shape (which carries Severity + +// Category) into the agent-facing resourcecontext.KyvernoFinding shape +// (Policy / Rule / Result / Message only). Keeping the projection narrow +// here lets unrelated changes to policyreports.Finding evolve without +// perturbing the wire contract that downstream callers depend on. +type policyReportLookupAdapter struct { + idx *policyreports.Index +} + +func (a policyReportLookupAdapter) FindingsFor(kind, namespace, name string) []resourcecontext.KyvernoFinding { + if a.idx == nil { + return nil + } + findings := a.idx.FindingsFor(kind, namespace, name) + if len(findings) == 0 { + return nil + } + out := make([]resourcecontext.KyvernoFinding, len(findings)) + for i, f := range findings { + out[i] = resourcecontext.KyvernoFinding{ + Policy: f.Policy, + Rule: f.Rule, + Result: f.Result, + Message: f.Message, + } + } + return out +} + // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aicontext.VerbosityLevel { switch r.URL.Query().Get("verbosity") { @@ -249,6 +281,13 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin AuditSummary: auditSum, } + // Wire the PolicyReport index when Kyverno is installed. Build emits a + // counts-only `policySummary.kyverno` on the basic tier; diagnostic + // tier (T10) will surface the top[] findings. + if idx := k8s.GetPolicyReportIndex(); idx != nil { + opts.PolicyReports = policyReportLookupAdapter{idx: idx} + } + if topo, prov, dyn, ok := s.topologyForContext(namespace); ok { opts.Topology = topo opts.Provider = prov From 2aec8ec38281ecd3e6d94f8234d6f57850ce2279 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 11:06:04 +0300 Subject: [PATCH 03/17] =?UTF-8?q?fix(resourcecontext):=20tier-aware=20Poli?= =?UTF-8?q?cySummary=20=E2=80=94=20basic=20emits=20counts=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught a plan-vs-code mismatch: the locked v1 contract says basic tier emits Kyverno counts only (fail/warn/pass), with Top[] findings reserved for diagnostic tier. The code emitted Top[] on both tiers, inflating basic-tier wire size with details that belong on the deeper agent investigation path. Fix: buildPolicySummary now takes ContextTier. Basic emits counts only; diagnostic adds the Top[] (cap 3, ordered fail > warn > error > pass). Existing TestBuild_PolicyReports_RolledUp split into two cases that pin both tier outputs. --- pkg/resourcecontext/build.go | 44 ++++++++++++++++++------------- pkg/resourcecontext/build_test.go | 40 +++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index c3d7374d4..1eaf9437f 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -143,11 +143,13 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte rc.IssueSummary = opts.IssueSummary rc.AuditSummary = opts.AuditSummary - // 5. PolicyReports — Kyverno findings rolled up. + // 5. PolicyReports — Kyverno findings rolled up. Basic tier emits + // counts only (fail/warn/pass); diagnostic tier adds the top[] + // findings. Tier discrimination keeps the basic-tier wire size tight. if opts.PolicyReports != nil { findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) if len(findings) > 0 { - rc.PolicySummary = buildPolicySummary(findings) + rc.PolicySummary = buildPolicySummary(findings, opts.Tier) } } @@ -639,10 +641,14 @@ func checkRef(ctx context.Context, ac RefAccessChecker, r *ContextRef) bool { // buildPolicySummary rolls up Kyverno findings into the summary block. // Top findings are picked first by fail > warn > error > pass, then by -// stable input order — callers can prune to MAX (3 today). +// stable input order — capped at policySummaryTopMax. +// +// Tier discrimination: basic emits counts only (Fail/Warn/Pass) for a +// minimal wire footprint; diagnostic adds the Top[] findings. Locked +// in the plan's v1 contract. const policySummaryTopMax = 3 -func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { +func buildPolicySummary(findings []KyvernoFinding, tier ContextTier) *PolicySummary { var fail, warn, pass int for _, f := range findings { switch f.Result { @@ -655,23 +661,25 @@ func buildPolicySummary(findings []KyvernoFinding) *PolicySummary { } } - // Order Top by result priority; cap to policySummaryTopMax. - ordered := append([]KyvernoFinding(nil), findings...) - sort.SliceStable(ordered, func(i, j int) bool { - return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) - }) - if len(ordered) > policySummaryTopMax { - ordered = ordered[:policySummaryTopMax] + ks := &KyvernoSummary{ + Fail: fail, + Warn: warn, + Pass: pass, } - return &PolicySummary{ - Kyverno: &KyvernoSummary{ - Fail: fail, - Warn: warn, - Pass: pass, - Top: ordered, - }, + // Top[] only on diagnostic tier. Basic stays counts-only. + if tier == TierDiagnostic { + ordered := append([]KyvernoFinding(nil), findings...) + sort.SliceStable(ordered, func(i, j int) bool { + return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) + }) + if len(ordered) > policySummaryTopMax { + ordered = ordered[:policySummaryTopMax] + } + ks.Top = ordered } + + return &PolicySummary{Kyverno: ks} } func resultRank(r string) int { diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 503fee9d9..bfdccccf8 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -425,7 +425,9 @@ func TestBuild_HPA_Identity(t *testing.T) { } } -func TestBuild_PolicyReports_RolledUp(t *testing.T) { +func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { + // Basic tier emits counts only (fail/warn/pass). Top[] is reserved + // for diagnostic tier — keeps the basic-tier wire footprint minimal. pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} reports := mockPolicyReports{ "Pod/prod/p": { @@ -447,8 +449,8 @@ func TestBuild_PolicyReports_RolledUp(t *testing.T) { if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) } - if len(k.Top) == 0 || k.Top[0].Result != "fail" { - t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + if len(k.Top) != 0 { + t.Errorf("basic tier must NOT emit Top[]; got %d entries: %+v", len(k.Top), k.Top) } gotHint := false for _, h := range rc.Hints { @@ -462,6 +464,38 @@ func TestBuild_PolicyReports_RolledUp(t *testing.T) { } } +func TestBuild_PolicyReports_DiagnosticTierIncludesTop(t *testing.T) { + // Diagnostic tier adds the Top[] findings (capped at 3, ordered + // fail > warn > error > pass). Used by the deep agent investigation + // path — basic tier is for everyday triage. + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierDiagnostic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) == 0 { + t.Fatal("diagnostic tier must emit Top[] findings") + } + if k.Top[0].Result != "fail" { + t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + } +} + func TestBuild_PDB_OutputJSONShape(t *testing.T) { // Pin the wire shape one full populated Build produces, so a future // reorder of fields (or accidental omitempty change) is caught. From 045097cea7491795d67ee17c77b477ae1a8f1ea4 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:12:59 +0300 Subject: [PATCH 04/17] refactor(resourcecontext): consume topology.Relationships.ManagedBy + Pod hygiene fields (T23 dedup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit T23 (#720) made topology.Relationships the canonical projection for server-synthesized ManagedBy (owner-chain + GitOps detection) and Pod hygiene fields (ServiceAccount, Node). Drop the parallel owner-walk and label/annotation scanning inside resourcecontext.Build — read Relationships.{ManagedBy,ServiceAccount,Node} instead. Build now calls topology.GetRelationshipsWithObject when Topology is set, passing the fetched obj so kind/group disambiguation works (Knative serving.knative.dev/Service vs core/v1 Service). Single- resource callers (REST GET /api/ai/resources) pass idx=nil — the per- call O(E) scan is fine for one walk per request; bulk callers should pre-build via topology.IndexByResource(topo) and pass through Options.RelIndex or Options.Relationships. No deprecation aliases: buildManagedBy, parseArgoTrackingID, pickControllerOwner, groupFromAPIVersion, readPair, and the gitops/flux label constants are deleted outright (per saved feedback preference). Their tests are removed — the same logic is exercised in pkg/topology/managedby_test.go. --- internal/server/ai_handlers.go | 8 + pkg/resourcecontext/build.go | 290 ++++++++++++------------------ pkg/resourcecontext/build_test.go | 112 ++++-------- 3 files changed, 154 insertions(+), 256 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 35adde577..3b27b9021 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -292,6 +292,14 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts.Topology = topo opts.Provider = prov opts.DynamicProv = dyn + // Pre-compute Relationships once with the already-fetched obj so + // kind/group disambiguation works (Knative serving.knative.dev/Service + // vs core/v1 Service). idx=nil is fine for single-resource: the + // per-call inline scan is O(E) once. Bulk callers (T12/T89) should + // build a shared index via topology.IndexByResource(topo). + opts.Relationships = topology.GetRelationshipsWithObject( + kind, namespace, name, obj, topo, prov, dyn, nil, + ) } return resourcecontext.Build(r.Context(), obj, opts) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 1eaf9437f..ea69d041f 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -3,7 +3,6 @@ package resourcecontext import ( "context" "sort" - "strings" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -34,11 +33,28 @@ type Options struct { AccessChecker RefAccessChecker // Topology data sources. When Topology is nil, the topology-derived - // fields (Exposes, SelectedBy, ScaledBy) are skipped. + // fields (Exposes, SelectedBy, ScaledBy, ManagedBy, RunsOn, + // Uses.ServiceAccount) are skipped. Topology *topology.Topology Provider topology.ResourceProvider DynamicProv topology.DynamicProvider + // Relationships is the pre-computed per-resource projection. When non-nil, + // Build consumes it directly instead of calling + // topology.GetRelationshipsWithObject — single-resource handlers should + // leave this nil and let Build compute; bulk/list callers that already + // loop over relationships per row SHOULD pass it to avoid double work. + // + // Topology MUST still be set when Relationships is set — synthesis + // helpers (e.g. ManagedBy owner walk) read Topology and RelIndex through + // it. + Relationships *topology.Relationships + + // RelIndex is the topology inverted-edge index. Pass a shared instance + // (topology.IndexByResource(topo)) for high-fanout callers; nil is fine + // for single-resource Build paths — the per-call inline scan is O(E) once. + RelIndex *topology.RelationshipsIndex + // Pre-computed summaries — pass-through into the response. IssueSummary *IssueSummary AuditSummary *AuditSummary @@ -86,18 +102,47 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte rc := &ResourceContext{Tier: opts.Tier} omitted := newOmittedTracker() - // 1. ManagedBy — owner chain + GitOps labels/annotations - rc.ManagedBy = filterRefs( - ctx, opts.AccessChecker, - buildManagedBy(ident), - "managedBy", omitted, - ) + // Topology-derived relationships drive ManagedBy / Exposes / SelectedBy / + // ScaledBy / RunsOn / Uses.ServiceAccount. T23 made + // topology.Relationships the canonical projection: server-side + // SynthesizeManagedBy walks the owner chain + GitOps signals, and the Pod + // hygiene fields (.ServiceAccount, .Node) are populated from pod.Spec. + // We do NOT re-walk owner refs here — that would duplicate the topology + // package's logic and risk drift. + // + // Single-resource callers (REST GET, MCP get_resource) leave + // opts.Relationships nil and let us compute via GetRelationshipsWithObject + // — passing obj keeps kind/group disambiguation correct for CRDs whose + // plural collides with a core resource. Bulk callers that already loop + // over relationships per row pass them in directly. + rel := opts.Relationships + if rel == nil && opts.Topology != nil { + rel = topology.GetRelationshipsWithObject( + ident.Kind, ident.Namespace, ident.Name, obj, + opts.Topology, opts.Provider, opts.DynamicProv, opts.RelIndex, + ) + } + + // 1. ManagedBy — prefer Relationships.ManagedBy (server-synthesized when + // a topology is available; covers GitOps signals + owner-chain walk). + // Fall back to topology.SynthesizeManagedBy with the obj alone when no + // topology is provided — that path still detects Argo/Flux/Helm signals + // from labels and annotations without needing a graph. + var managedBy []topology.ResourceRef + if rel != nil && len(rel.ManagedBy) > 0 { + managedBy = rel.ManagedBy + } else if rel == nil { + if m, ok := obj.(metav1.Object); ok { + managedBy = topology.SynthesizeManagedBy(m, ident.Kind, ident.Namespace, ident.Name, nil, nil, nil) + } + } + if len(managedBy) > 0 { + rc.ManagedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(managedBy, ReasonOwnerReference, SourceOwnerChain), + "managedBy", omitted) + } // 2. Topology-derived: Exposes, SelectedBy, ScaledBy - var rel *topology.Relationships - if opts.Topology != nil { - rel = topology.GetRelationships(ident.Kind, ident.Namespace, ident.Name, opts.Topology, opts.Provider, opts.DynamicProv) - } if rel != nil { exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) exposes = append(exposes, rel.Services...) @@ -120,14 +165,49 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte "scaledBy", omitted) } - // 3. Pod-specific: Uses + RunsOn + // 3. Pod-specific: RunsOn (Node) + Uses (ConfigMap/Secret/PVC/SA). + // + // RunsOn and Uses.ServiceAccount come from topology.Relationships when + // available (T23 populates them from pod.Spec server-side). We still + // scan pod.Spec.Volumes / .EnvFrom directly for the ConfigMap/Secret/PVC + // inventory — topology doesn't model those use-edges at the granularity + // Build needs. if pod, ok := obj.(*corev1.Pod); ok { rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) - if pod.Spec.NodeName != "" { + // Prefer rel.ServiceAccount over re-reading pod.Spec — same source, + // but consolidating through Relationships keeps Build aligned with + // how MCP/agents consume the field. + if rc.Uses != nil && rc.Uses.ServiceAccount == nil && rel != nil && rel.ServiceAccount != nil { + candidate := &ContextRef{ + Kind: rel.ServiceAccount.Kind, + Group: rel.ServiceAccount.Group, + Namespace: rel.ServiceAccount.Namespace, + Name: rel.ServiceAccount.Name, + Reason: ReasonSAName, + Source: SourceK8sSpec, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.Uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + // RunsOn: prefer the topology-supplied Node ref; fall back to + // pod.Spec.NodeName only when topology is absent (no rel). + var nodeName, nodeGroup string + if rel != nil && rel.Node != nil { + nodeName = rel.Node.Name + nodeGroup = rel.Node.Group + } else if rel == nil { + nodeName = pod.Spec.NodeName + } + if nodeName != "" { candidate := &ContextRef{ Kind: "Node", - Name: pod.Spec.NodeName, + Group: nodeGroup, + Name: nodeName, Reason: ReasonNodeName, Source: SourceK8sSpec, } @@ -167,16 +247,14 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // --------------------------------------------------------------------------- // resourceIdentity is the projection of obj that Build needs without holding -// on to the full runtime.Object. Owner refs and labels feed ManagedBy; the -// (Kind, Namespace, Name) tuple keys topology + summary lookups. +// on to the full runtime.Object. The (Kind, Namespace, Name) tuple keys +// topology relationship lookups and summary lookups; Group is retained for +// future use by callers inspecting the identity directly. type resourceIdentity struct { - Kind string - Group string - Namespace string - Name string - Labels map[string]string - Annotations map[string]string - Owners []metav1.OwnerReference + Kind string + Group string + Namespace string + Name string } // identityOf extracts identity from a typed K8s object or unstructured. @@ -241,13 +319,10 @@ func identityOf(obj runtime.Object) (resourceIdentity, bool) { case *unstructured.Unstructured: gvk := v.GroupVersionKind() return resourceIdentity{ - Kind: gvk.Kind, - Group: gvk.Group, - Namespace: v.GetNamespace(), - Name: v.GetName(), - Labels: v.GetLabels(), - Annotations: v.GetAnnotations(), - Owners: v.GetOwnerReferences(), + Kind: gvk.Kind, + Group: gvk.Group, + Namespace: v.GetNamespace(), + Name: v.GetName(), }, true } return resourceIdentity{}, false @@ -255,156 +330,11 @@ func identityOf(obj runtime.Object) (resourceIdentity, bool) { func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { return resourceIdentity{ - Kind: kind, - Group: group, - Namespace: m.Namespace, - Name: m.Name, - Labels: m.Labels, - Annotations: m.Annotations, - Owners: m.OwnerReferences, - } -} - -// --------------------------------------------------------------------------- -// ManagedBy detection -// --------------------------------------------------------------------------- - -// GitOps label/annotation keys — kept in sync with packages/k8s-ui/src/utils/gitops-owner.ts. -const ( - argoTrackingIDAnnotation = "argocd.argoproj.io/tracking-id" - argoInstanceLabel = "argocd.argoproj.io/instance" - fluxKustomizeNameLabel = "kustomize.toolkit.fluxcd.io/name" - fluxKustomizeNSLabel = "kustomize.toolkit.fluxcd.io/namespace" - fluxHelmNameLabel = "helm.toolkit.fluxcd.io/name" - fluxHelmNSLabel = "helm.toolkit.fluxcd.io/namespace" -) - -// buildManagedBy returns the ContextRefs describing what manages this -// resource. Precedence (most-specific wins): -// 1. Flux HelmRelease labels -// 2. Flux Kustomization labels -// 3. Argo tracking-id annotation -// 4. Argo instance label -// 5. First owner reference (controller=true preferred) -// -// Only one path emits today — the field is a slice so future taxonomies -// (e.g. dual ArgoCD + Flux) can list multiple managers without a wire change. -func buildManagedBy(ident resourceIdentity) []ContextRef { - if name, ns, ok := readPair(ident.Labels, fluxHelmNameLabel, fluxHelmNSLabel); ok { - return []ContextRef{{ - Kind: "HelmRelease", - Group: "helm.toolkit.fluxcd.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - if name, ns, ok := readPair(ident.Labels, fluxKustomizeNameLabel, fluxKustomizeNSLabel); ok { - return []ContextRef{{ - Kind: "Kustomization", - Group: "kustomize.toolkit.fluxcd.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - if id := ident.Annotations[argoTrackingIDAnnotation]; id != "" { - if ns, name, ok := parseArgoTrackingID(id); ok && name != "" { - return []ContextRef{{ - Kind: "Application", - Group: "argoproj.io", - Namespace: ns, - Name: name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - } - if inst := ident.Labels[argoInstanceLabel]; inst != "" { - // App namespace unknown without tracking-id — emit with empty ns - // like the UI does; the consumer decides whether to navigate. - return []ContextRef{{ - Kind: "Application", - Group: "argoproj.io", - Name: inst, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - - if owner := pickControllerOwner(ident.Owners); owner != nil { - group := groupFromAPIVersion(owner.APIVersion) - return []ContextRef{{ - Kind: owner.Kind, - Group: group, - Namespace: ident.Namespace, - Name: owner.Name, - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, - }} - } - return nil -} - -func readPair(m map[string]string, k1, k2 string) (string, string, bool) { - a := m[k1] - b := m[k2] - if a == "" || b == "" { - return "", "", false - } - return a, b, true -} - -// parseArgoTrackingID mirrors gitops-owner.ts. Two forms: -// -// ":..." (legacy, single name) -// "_:..." (namespaced install) -// -// Returns (ns, name, ok). -func parseArgoTrackingID(value string) (string, string, bool) { - colon := strings.IndexByte(value, ':') - if colon < 0 { - return "", "", false - } - head := value[:colon] - if head == "" { - return "", "", false - } - if sep := strings.IndexByte(head, '_'); sep >= 0 { - ns := head[:sep] - name := head[sep+1:] - if name == "" { - return "", "", false - } - return ns, name, true - } - return "", head, true -} - -// pickControllerOwner returns the first owner with Controller=true; falls -// back to the first owner if none are marked controller. Returns nil when -// the slice is empty. -func pickControllerOwner(owners []metav1.OwnerReference) *metav1.OwnerReference { - for i := range owners { - if owners[i].Controller != nil && *owners[i].Controller { - return &owners[i] - } - } - if len(owners) > 0 { - return &owners[0] - } - return nil -} - -// groupFromAPIVersion extracts the group from "group/version" or "version" -// (core/v1 form). Mirrors schema.ParseGroupVersion without the import. -func groupFromAPIVersion(apiVersion string) string { - if i := strings.IndexByte(apiVersion, '/'); i >= 0 { - return apiVersion[:i] + Kind: kind, + Group: group, + Namespace: m.Namespace, + Name: m.Name, } - return "" } // --------------------------------------------------------------------------- diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index bfdccccf8..3a71b81ec 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -59,7 +59,7 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { "app.kubernetes.io/name": "web", }, Annotations: map[string]string{ - argoTrackingIDAnnotation: "argocd_storefront:apps/Deployment:prod/web", + "argocd.argoproj.io/tracking-id": "argocd_storefront:apps/Deployment:prod/web", }, OwnerReferences: []metav1.OwnerReference{ {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "web-7d", Controller: ptrBool(true)}, @@ -234,8 +234,8 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { Name: "web", Namespace: "prod", Labels: map[string]string{ - fluxHelmNameLabel: "web", - fluxHelmNSLabel: "flux-system", + "helm.toolkit.fluxcd.io/name": "web", + "helm.toolkit.fluxcd.io/namespace": "flux-system", }, }, } @@ -328,8 +328,10 @@ func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { } func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { - // A ConfigMap with a controller owner reference. No topology, no Pod - // spec — just owner-chain ManagedBy. + // A ConfigMap owned by a Deployment via EdgeManages — owner-chain + // ManagedBy is sourced from topology.SynthesizeManagedBy walking the + // owner graph (T23 canonical projection). No Pod spec, no GitOps + // labels — just the topology owner edge. cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "web-config", @@ -339,16 +341,26 @@ func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { }, }, } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "configmap/prod/web-config", Kind: topology.KindConfigMap, Name: "web-config"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "deployment/prod/web", Target: "configmap/prod/web-config", Type: topology.EdgeManages}, + }, + } rc := Build(context.Background(), cm, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, + Topology: topo, EmitHints: true, }) if got, want := len(rc.ManagedBy), 1; got != want { t.Fatalf("ManagedBy len: got %d want %d", got, want) } mb := rc.ManagedBy[0] - if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" || mb.Group != "apps" { + if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" { t.Errorf("ManagedBy[0]: got %+v", mb) } } @@ -388,9 +400,14 @@ func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { } func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { + // Flux Helm labels — detected from obj metadata directly via + // topology.SynthesizeManagedBy without needing a populated Topology. dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", - OwnerReferences: []metav1.OwnerReference{{Kind: "Foo", APIVersion: "ex.io/v1", Name: "f", Controller: ptrBool(true)}}}, + Labels: map[string]string{ + "helm.toolkit.fluxcd.io/name": "web", + "helm.toolkit.fluxcd.io/namespace": "flux-system", + }}, } rc := Build(context.Background(), dep, Options{ Tier: TierBasic, @@ -508,9 +525,21 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { }, Spec: corev1.PodSpec{NodeName: "n1"}, } + // Topology with the owner edge so SynthesizeManagedBy can walk the + // chain and emit a ReplicaSet ManagedBy ref for wire-shape coverage. + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "pod/prod/p", Kind: topology.KindPod, Name: "p"}, + {ID: "replicaset/prod/rs", Kind: topology.KindReplicaSet, Name: "rs"}, + }, + Edges: []topology.Edge{ + {Source: "replicaset/prod/rs", Target: "pod/prod/p", Type: topology.EdgeManages}, + }, + } rc := Build(context.Background(), pod, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, + Topology: topo, EmitHints: true, }) b, err := json.MarshalIndent(rc, "", " ") @@ -530,75 +559,6 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { } } -// --------------------------------------------------------------------------- -// Sub-helpers' unit coverage -// --------------------------------------------------------------------------- - -func TestParseArgoTrackingID(t *testing.T) { - cases := []struct { - in string - wantNS string - wantName string - wantOK bool - shortName string - }{ - {"argocd_store:apps/Deployment:prod/web", "argocd", "store", true, "namespaced form"}, - {"store:apps/Deployment:prod/web", "", "store", true, "legacy form"}, - {"", "", "", false, "empty"}, - {":foo/bar", "", "", false, "missing head"}, - {"a_:foo", "", "", false, "missing name"}, - } - for _, c := range cases { - t.Run(c.shortName, func(t *testing.T) { - ns, name, ok := parseArgoTrackingID(c.in) - if ns != c.wantNS || name != c.wantName || ok != c.wantOK { - t.Errorf("parseArgoTrackingID(%q) = (%q, %q, %v) want (%q, %q, %v)", - c.in, ns, name, ok, c.wantNS, c.wantName, c.wantOK) - } - }) - } -} - -func TestGroupFromAPIVersion(t *testing.T) { - cases := map[string]string{ - "v1": "", - "apps/v1": "apps", - "argoproj.io/v1alpha1": "argoproj.io", - "networking.k8s.io/v1": "networking.k8s.io", - "helm.toolkit.fluxcd.io/v2beta1": "helm.toolkit.fluxcd.io", - } - for in, want := range cases { - if got := groupFromAPIVersion(in); got != want { - t.Errorf("groupFromAPIVersion(%q) = %q, want %q", in, got, want) - } - } -} - -func TestPickControllerOwner_PrefersController(t *testing.T) { - owners := []metav1.OwnerReference{ - {Kind: "Other", Name: "x"}, - {Kind: "Boss", Name: "ctrl", Controller: ptrBool(true)}, - } - got := pickControllerOwner(owners) - if got == nil || got.Name != "ctrl" { - t.Errorf("got %+v, want ctrl", got) - } -} - -func TestPickControllerOwner_FallsBackToFirst(t *testing.T) { - owners := []metav1.OwnerReference{ - {Kind: "Solo", Name: "first"}, - {Kind: "Other", Name: "x"}, - } - got := pickControllerOwner(owners) - if got == nil || got.Name != "first" { - t.Errorf("got %+v, want first", got) - } - if got := pickControllerOwner(nil); got != nil { - t.Errorf("nil owners should return nil, got %+v", got) - } -} - // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- From 99c878c5c135fb5b5db16856c7f56177740ab112 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 12:30:53 +0300 Subject: [PATCH 05/17] fix(security): RBAC preflight on /api/ai/resources GET handler handleAIGetResource skipped the user-RBAC gates that handleGetResource runs, so a user with namespace access but no per-namespace get-secrets SAR could read Secret values via the AI endpoint, and a user without cluster-scoped node SAR could read Node objects. The AI surface returns the same resource bytes (just minified + wrapped) as the REST surface, so it must enforce the same gates. Extract the gate block into Server.preflightResourceGet and call it from both handlers. Single helper keeps the two endpoints in lockstep so future RBAC adjustments touch one place. Tests cover the three deny arms (per-ns get-secrets, cluster-scoped get-node, namespace access) plus a passing-case sanity check on the AI envelope shape. --- internal/server/ai_handlers.go | 11 +++ internal/server/ai_handlers_rbac_test.go | 101 +++++++++++++++++++++++ internal/server/server.go | 82 +++++++++++------- 3 files changed, 163 insertions(+), 31 deletions(-) create mode 100644 internal/server/ai_handlers_rbac_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 3b27b9021..e8eff3284 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -182,6 +182,17 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { namespace = "" } + // Run the same RBAC preflight as handleGetResource — the AI endpoint + // returns the same resource bytes (just minified) and must gate on the + // same per-user SAR / namespace-access tuple. Without this, a user with + // no `get secrets` SAR could read Secret values via /api/ai/resources/… + // even though /api/resources/… correctly returns 403. Runs BEFORE the + // fetch so cluster-scoped denies don't leak existence by status code. + if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go new file mode 100644 index 000000000..0279bf243 --- /dev/null +++ b/internal/server/ai_handlers_rbac_test.go @@ -0,0 +1,101 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + + "github.com/skyhook-io/radar/internal/auth" +) + +// RBAC preflight on /api/ai/resources/{kind}/{namespace}/{name}. +// +// The AI single-resource GET returns the same resource bytes (just minified +// + wrapped in a resourceContext block) as /api/resources/{kind}/{ns}/{name}. +// It must therefore enforce the same per-user RBAC gates that +// handleGetResource enforces — otherwise a user could read Secret values via +// the AI surface even when the REST surface correctly returns 403. +// +// Both handlers call s.preflightResourceGet, so these tests pin the AI +// endpoint's gates (and a regression that bypasses the helper on the AI side +// would surface here even if the REST tests still pass). + +func TestProxyAuth_AIGetSecret_PerNamespaceRBAC_Denied(t *testing.T) { + // alice has namespace access to "default" but the per-namespace + // canRead("","secrets","default","get") returns false. The cache holds + // nginx-tls (seeded as the SA which has cluster-wide secrets RBAC), + // so without the preflight a 200 would leak secret bytes. + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretGetCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secret/default/nginx-tls", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-secret without per-ns get SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetNode_ClusterScopedRBAC_Denied(t *testing.T) { + // Node is cluster-scoped — the AI GET must require per-kind get-node SAR. + // AllowedNamespaces==nil (cluster-wide-namespace sentinel) is NOT a + // license to read cluster-scoped kinds: that's the exact conflation the + // preflight helper guards against. A regression that dropped the + // ClassifyKindScope arm would let nodes through here. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("get", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/node/_/worker-1", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-node without cluster-scoped get-node SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceDenied(t *testing.T) { + // alice has namespace access only to "default" — a get against a pod + // in "kube-system" must 403 BEFORE any fetch, matching handleGetResource. + // A regression that fetched first and then filtered would let timing + // signal whether the pod exists (oracle). + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/kube-system/some-pod", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-pod in disallowed namespace, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceAllowed(t *testing.T) { + // Sanity check: a user with namespace access AND who hits an existing + // resource gets a 200 with the {resource, resourceContext} envelope. + // Pins that the preflight isn't accidentally over-gating happy-path + // requests (e.g., a misordered check that always denies). + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/default/nginx-abc-xyz", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200 on allowed AI get-pod, got %d", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + if _, ok := body["resource"]; !ok { + t.Errorf("expected 'resource' field in AI get response, got: %+v", body) + } + if _, ok := body["resourceContext"]; !ok { + t.Errorf("expected 'resourceContext' field in AI get response, got: %+v", body) + } +} diff --git a/internal/server/server.go b/internal/server/server.go index b4e350187..23e81e941 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1470,29 +1470,23 @@ func setTypeMeta(resource any) { k8s.SetTypeMeta(resource) } -func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - namespace := chi.URLParam(r, "namespace") - name := chi.URLParam(r, "name") - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace - if namespace == "_" { - namespace = "" - } - - // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are - // gated per-kind via SAR. Run BEFORE the namespace access check so - // users with explicit cluster-scoped RBAC but no namespace access can - // still get the resource. ClassifyKindScope catches both static cluster- - // only kinds and dynamic cluster-scoped CRDs (via discovery). - // - // "namespaces" is cluster-scoped at the K8s API but exposed as a per-user - // filtered list — gate the GET via the user's namespace access for the - // requested name, not via cluster-scoped SAR. +// preflightResourceGet runs the per-user RBAC gates that must pass before any +// single-resource GET fetch. Mirrors the kind/scope-aware logic used by both +// the REST handler (handleGetResource) and the AI handler (handleAIGetResource) +// so future RBAC adjustments stay in lockstep across both surfaces. +// +// Inputs are the already-normalized (kind, namespace, name, group); callers +// must collapse the cluster-scoped "_" placeholder before calling. Returns +// (status, message, ok=true) when the request passes the gates, or +// (status, message, ok=false) with the HTTP status + body the caller should +// emit on deny. +// +// Three gates, run in this order: +// 1. kind == "namespaces" → full Namespace object requires get-namespaces SAR +// 2. cluster-scoped (Node/CRD/…) → per-kind get SAR (ClassifyKindScope) +// 3. namespaced → namespace access via getUserNamespaces, +// plus per-namespace get SAR for Secrets +func (s *Server) preflightResourceGet(r *http.Request, kind, namespace, name, group string) (int, string, bool) { isNamespacesKind := kind == "namespaces" || kind == "namespace" isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) switch { @@ -1502,30 +1496,56 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { // imply read access to the Namespace object itself. Restricted users // without ClusterRole on namespaces get 403 here. if !s.canRead(r, "", "namespaces", "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name), false } case isClusterScoped: if !s.canRead(r, gvrGroup, gvrResource, "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind)) - return + return http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind), false } case namespace != "": // Namespaced kind: verify namespace access. allowed := s.getUserNamespaces(r, []string{namespace}) if noNamespaceAccess(allowed) { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace), false } // Per-kind RBAC inside the namespace for Secrets — the chart can // grant the SA cluster-wide secrets (Helm release visibility), so // namespace-list discovery is not a sufficient gate here. The list // handler has the matching list-SAR. if (kind == "secrets" || kind == "secret") && !s.canRead(r, "", "secrets", namespace, "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace), false } } + return 0, "", true +} + +func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace + if namespace == "_" { + namespace = "" + } + + // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are + // gated per-kind via SAR. Run BEFORE the namespace access check so + // users with explicit cluster-scoped RBAC but no namespace access can + // still get the resource. ClassifyKindScope catches both static cluster- + // only kinds and dynamic cluster-scoped CRDs (via discovery). + // + // "namespaces" is cluster-scoped at the K8s API but exposed as a per-user + // filtered list — gate the GET via the user's namespace access for the + // requested name, not via cluster-scoped SAR. + if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } cache := k8s.GetResourceCache() if cache == nil { From d0a91c7ee58534c91e2754c37299ab93b00726ae Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:03:40 +0300 Subject: [PATCH 06/17] fix(resourcecontext): audit cross-Kind contamination + RunsOn fallback + selectedBy mislabel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer's critical (#721): computeAuditSummaryForResource dropped the Kind from its signature and matched only (namespace, name) by iterating the audit index. A Deployment "web" in "prod" silently inherited audit findings from a Service or ConfigMap of the same name in the same namespace. The loop's map-iteration order also made TopFinding non-deterministic across runs for tied severities — breaking the determinism guarantee SynthesizeHints downstream advertises. Fix derives the canonical kind from obj's TypeMeta in buildAIResourceContext (Pascal singular — exactly what audit's check runner writes into Finding.Kind) and threads it into the audit lookup. The lookup is now a single map access via bpaudit.ResourceKey instead of an O(n) scan. TopFinding selection sorts by (severity desc, CheckID asc) so ties resolve identically every run. Important fixes also addressed: - RunsOn fallback gap (pkg/resourcecontext/build.go:197): the `else if rel == nil` guard meant pod.Spec.NodeName was used only when there was no topology at all. When topology was present but rel.Node was nil (Node informer cold, node not yet indexed), RunsOn stayed empty even though the Pod spec named a node. Now falls back any time rel.Node is missing. - selectedByHint mislabel (pkg/resourcecontext/hints.go:124): every non-PDB ref was rendered as NetworkPolicy, including any future kind added to SelectedBy. Now explicit-match each known kind (PodDisruptionBudget, NetworkPolicy) and drop unrecognized kinds through summarizeKindsCounts instead of mislabeling. --- internal/server/ai_handlers.go | 58 +++++++++++++++++++++------------- pkg/resourcecontext/build.go | 11 +++++-- pkg/resourcecontext/hints.go | 21 ++++++++---- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index e8eff3284..a6afc6231 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net/http" + "sort" "strings" "github.com/go-chi/chi/v5" @@ -281,8 +282,18 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin } cache := k8s.GetResourceCache() + // Canonical kind from the resource's own TypeMeta (set at fetch). Pascal + // singular — matches what the audit check runner writes into Finding.Kind, + // so the audit index lookup keys correctly. Falls back to the URL kind + // only when TypeMeta is somehow empty; non-canonical input there would + // silently mis-key the audit lookup. + canonicalKind := obj.GetObjectKind().GroupVersionKind().Kind + if canonicalKind == "" { + canonicalKind = kind + } + issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) - auditSum := computeAuditSummaryForResource(cache, namespace, name) + auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, @@ -411,11 +422,19 @@ func composeSeverityRank(s issues.Severity) int { } // computeAuditSummaryForResource looks up audit findings for the subject -// resource. Uses pkg/audit.IndexByResource so the lookup is keyed on the -// canonical (Kind/ns/name) tuple — handles plural→singular normalization -// via the Finding.Kind values written by the check runner. -func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name string) *resourcecontext.AuditSummary { - if cache == nil { +// resource via the canonical (Kind/ns/name) tuple. kind MUST be the Pascal +// singular form the audit check runner writes into Finding.Kind (e.g. "Pod", +// not "pod" or "pods") — the caller derives it from obj's TypeMeta. Without +// a Kind-aware key, a Deployment "web" in "prod" would inherit findings +// from a Service "web" in the same namespace, since map iteration in the +// previous implementation only compared (namespace, name). +// +// TopFinding is selected deterministically: highest severity wins, with +// CheckID as the ascending tiebreaker. Map iteration ordering does NOT +// influence the choice — relevant because SynthesizeHints downstream +// advertises deterministic output. +func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil || kind == "" { return nil } results := audit.RunFromCache(cache, []string{namespace}, nil) @@ -423,27 +442,22 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, namespace, name st return nil } idx := bpaudit.IndexByResource(results.Findings) - var match []bpaudit.Finding - for key, fs := range idx { - parts := strings.SplitN(key, "/", 3) - if len(parts) != 3 { - continue - } - if parts[1] == namespace && parts[2] == name { - match = append(match, fs...) - } - } + match := idx[bpaudit.ResourceKey(kind, namespace, name)] if len(match) == 0 { return nil } - var topSeverity, topFinding string - for _, f := range match { - if topSeverity == "" || auditSeverityRank(f.Severity) > auditSeverityRank(topSeverity) { - topSeverity = f.Severity - topFinding = f.CheckID + // Sort by (severity desc, CheckID asc) so TopFinding is deterministic + // across runs even when multiple findings tie on severity. + sort.Slice(match, func(i, j int) bool { + ri, rj := auditSeverityRank(match[i].Severity), auditSeverityRank(match[j].Severity) + if ri != rj { + return ri > rj } - } + return match[i].CheckID < match[j].CheckID + }) + topSeverity := match[0].Severity + topFinding := match[0].CheckID return &resourcecontext.AuditSummary{ Count: len(match), HighestSeverity: topSeverity, diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index ea69d041f..e8ef0f93a 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -194,13 +194,18 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } } - // RunsOn: prefer the topology-supplied Node ref; fall back to - // pod.Spec.NodeName only when topology is absent (no rel). + // RunsOn: prefer the topology-supplied Node ref. Fall back to + // pod.Spec.NodeName any time rel.Node is empty — the Node informer + // may be cold, the node may not yet be in the topology graph, or + // rel itself may be nil. The previous `else if rel == nil` guard + // dropped the fallback when topology was built but rel.Node hadn't + // been populated yet, leaving RunsOn empty even though the Pod + // spec clearly named a node. var nodeName, nodeGroup string if rel != nil && rel.Node != nil { nodeName = rel.Node.Name nodeGroup = rel.Node.Group - } else if rel == nil { + } else { nodeName = pod.Spec.NodeName } if nodeName != "" { diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go index 30ccd698b..a385f455f 100644 --- a/pkg/resourcecontext/hints.go +++ b/pkg/resourcecontext/hints.go @@ -125,23 +125,32 @@ func selectedByHint(refs []ContextRef) string { if len(refs) == 0 { return "" } - // Distinguish PDB vs NetworkPolicy in the hint — they read very - // differently to a human, and lumping them together loses signal. - var pdb, np []ContextRef + // Distinguish known SelectedBy kinds (PDB vs NetworkPolicy) in the hint — + // they read very differently to a human, and lumping them together loses + // signal. Match each kind explicitly: a future kind added to SelectedBy + // (e.g. ValidatingAdmissionPolicy) would otherwise be silently rendered + // as NetworkPolicy. Unrecognized kinds drop through to summarizeKindsCounts. + var pdb, np, other []ContextRef for _, r := range refs { - if r.Kind == "PodDisruptionBudget" { + switch r.Kind { + case "PodDisruptionBudget": pdb = append(pdb, r) - } else { + case "NetworkPolicy": np = append(np, r) + default: + other = append(other, r) } } - parts := make([]string, 0, 2) + parts := make([]string, 0, 3) if n := len(np); n > 0 { parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) } if n := len(pdb); n > 0 { parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) } + if len(other) > 0 { + parts = append(parts, summarizeKindsCounts(other)) + } return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) } From 74ff1d561caf202b673390bbe39ebf7b99686506 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 16:45:25 +0300 Subject: [PATCH 07/17] fix(security): route group-qualified AI GET to dynamic cache to avoid kind collisions fetchAIResource was calling FetchResource (typed cache) before consulting the group qualifier. For /api/ai/resources/services/ns/name?group= serving.knative.dev, the typed cache returns the core/v1 Service and the ?group= is silently dropped, leaking the wrong object via the AI surface. Branch on group != "" first and route directly to GetDynamicWithGroup, mirroring handleGetResource's dispatch in server.go. The typed-first path remains correct (and faster) when no group is passed. Same bug class as T12's group-blind root lookup, on the single-resource GET path. --- internal/k8s/testing.go | 52 ++++++++ internal/server/ai_handlers.go | 16 +++ internal/server/ai_handlers_group_test.go | 149 ++++++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 internal/server/ai_handlers_group_test.go diff --git a/internal/k8s/testing.go b/internal/k8s/testing.go index 799b2811e..560cbf632 100644 --- a/internal/k8s/testing.go +++ b/internal/k8s/testing.go @@ -4,7 +4,9 @@ import ( "sync" "github.com/skyhook-io/radar/pkg/k8score" + "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" + fakeclientset "k8s.io/client-go/kubernetes/fake" ) // InitTestResourceCache creates a resource cache from a fake or test client, @@ -68,6 +70,56 @@ func InitTestResourceCache(client kubernetes.Interface) error { return nil } +// InitTestDynamicResourceCache wires the dynamic resource cache and discovery +// singletons against test fakes. Pass a dynamic client (typically from +// dynamicfake.NewSimpleDynamicClientWithCustomListKinds) and the set of +// APIResources to register in discovery. Each registered resource gets a GVR +// entry that group-qualified lookups (GetGVRWithGroup) and dynamic informers +// can resolve. +// +// Callers should defer ResetTestDynamicState — without it, the dynamic +// singletons leak into other tests that share TestMain state. +// +// This is intended for integration tests only. +func InitTestDynamicResourceCache(dynClient dynamic.Interface, resources []APIResource) error { + clientMu.Lock() + dynamicClient = dynClient + clientMu.Unlock() + + // Bootstrap discovery from a fake clientset so NewResourceDiscovery has a + // non-nil discovery client; AddAPIResource then registers the test-only + // GVRs (e.g. serving.knative.dev/Service) the test depends on. + fakeDisc := fakeclientset.NewSimpleClientset().Discovery() + core, err := k8score.NewResourceDiscovery(fakeDisc) + if err != nil { + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() + return err + } + for _, r := range resources { + core.AddAPIResource(r) + } + + discoveryMu.Lock() + resourceDiscovery = &ResourceDiscovery{ResourceDiscovery: core} + discoveryOnce = new(sync.Once) + discoveryOnce.Do(func() {}) + discoveryMu.Unlock() + + return InitDynamicResourceCache(nil) +} + +// ResetTestDynamicState tears down the dynamic cache + discovery singletons +// and clears the dynamic client. Pairs with InitTestDynamicResourceCache. +func ResetTestDynamicState() { + ResetDynamicResourceCache() + ResetResourceDiscovery() + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() +} + // SetTestContextName is a test-only helper that overrides the package-level // kubeconfig context name. Used by tests that exercise per-context state // (e.g. namespace preferences) without needing to spin up a real client. diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index a6afc6231..6ebab36f3 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -230,7 +230,23 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { // fetchAIResource resolves the resource from the typed cache or dynamic cache. // The bool reports whether the returned object is an unstructured (CRD) value. +// +// When a group is provided, the typed cache is skipped entirely and the +// dynamic cache is consulted with the group qualifier. This prevents kind +// collisions where a CRD plural shadows a core kind (e.g., Knative +// serving.knative.dev/Service vs core/v1 Service): without this branch, +// FetchResource("services", ...) would return the core Service from the +// typed informer and the requested group would never be consulted, leaking +// the wrong object via the AI surface. Mirrors handleGetResource's +// group-first dispatch in server.go. func (s *Server) fetchAIResource(ctx context.Context, cache *k8s.ResourceCache, kind, namespace, name, group string) (runtime.Object, bool, error) { + if group != "" { + u, err := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if err != nil { + return nil, false, err + } + return u, true, nil + } obj, err := k8s.FetchResource(cache, kind, namespace, name) if err == nil { return obj, false, nil diff --git a/internal/server/ai_handlers_group_test.go b/internal/server/ai_handlers_group_test.go new file mode 100644 index 000000000..da8891066 --- /dev/null +++ b/internal/server/ai_handlers_group_test.go @@ -0,0 +1,149 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + dynamicfake "k8s.io/client-go/dynamic/fake" + + "github.com/skyhook-io/radar/internal/k8s" +) + +// Group-qualified AI GET must route to the dynamic cache so CRDs whose +// plural shadows a core kind (Knative serving.knative.dev/Service vs +// core/v1 Service) resolve to the requested object — not whichever the +// typed cache happens to hold under that kind/name pair. +// +// Without the group-first branch in fetchAIResource, FetchResource( +// "services", ...) returns the core/v1 Service from the typed informer +// and ?group=serving.knative.dev is silently dropped. The bug surfaces +// as wrong-object disclosure on the AI surface: a caller asking for the +// Knative Service receives the core Service's spec + IP + selector +// instead. This pins the fix and would regress if the typed cache is +// consulted before the group qualifier. +// +// Same bug class as T12's group-blind root lookup, but on the single- +// resource GET path; ResourceContext relationship walks already disambig +// by group (see pkg/topology/managedby_test.go), so a regression here is +// the last remaining hot spot for kind/plural collisions on the GET API. +func TestAIGetResource_GroupRoutesToDynamic(t *testing.T) { + // Seed a Knative Service named "nginx" in "default" — same name+ns as + // the core Service registered in TestMain. Without ?group routing, the + // typed cache wins and returns the core Service. With it, the dynamic + // cache returns the Knative Service. + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + // Warm the informer so the Get() call below sees the seeded object + // without racing on initial sync. + dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev&context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + // context=none returns the minified resource directly (no envelope). + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "serving.knative.dev/v1" { + t.Fatalf("apiVersion = %q, want serving.knative.dev/v1 — group qualifier was ignored "+ + "and the typed cache's core Service was returned instead", apiVersion) + } + kind, _ := body["kind"].(string) + if kind != "Service" { + t.Errorf("kind = %q, want Service", kind) + } + // Cross-check: the core Service has a Spec.Selector / ClusterIP shape + // that the Knative seed does NOT have. A regression that returned the + // core Service would carry those fields here. + spec, _ := body["spec"].(map[string]any) + if _, hasSelector := spec["selector"]; hasSelector { + t.Errorf("response carries Service.spec.selector — looks like the core Service leaked through "+ + "despite ?group=serving.knative.dev; body=%+v", body) + } +} + +// Happy-path sibling for the test above: when no group is passed, the +// typed-cache-first path is correct (and must continue to be — the v1 +// core Service is the dominant case and must not pay a dynamic-cache +// detour just because the group-qualified branch was added). +func TestAIGetResource_NoGroupHitsTypedCache(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "v1" { + t.Fatalf("apiVersion = %q, want v1 (core Service) on no-group request", apiVersion) + } +} From 322d386ce5b7f857c938c8fc670d225364ef723d Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Sun, 17 May 2026 17:38:22 +0300 Subject: [PATCH 08/17] fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two residual bugs on the AI GET resourceContext path were dropping signal silently: 1) computeIssueSummaryForResource was called with the URL-plural kind ("deployments"), which the issues composer's Filters.Kinds matcher case-folds but does NOT plural-to-singular convert. Issue.Kind is the canonical Pascal singular ("Deployment"), so every issue got filtered out and IssueSummary.Count silently collapsed to 0 (Build then omits the field entirely). Fix: pass canonicalKind (derived from obj.GVK) into the function — same pattern computeAuditSummaryForResource was already using. 2) GetRelationshipsWithObject was called with the URL-plural kind, which buildNodeID resolved to the wrong topology node for cross-group CRDs whose Kind collides with a core kind. For a Knative serving.knative.dev Service request, "services" → "service/ns/name" picked the CORE Service node instead of "knativeservice/ns/name", so relationship walks returned the core Service's edges. Same shape for CAPI Cluster ("capicluster") and Istio Gateway ("istiogateway"). Added topology.KindForGVK as a small exported helper that maps (kind, group) → pseudo-kind for the three known cross-group collisions; the handler now funnels gvk through it before the relationship lookup. Non-colliding kinds (core, apps, batch, Gateway API, etc.) pass through unchanged so buildNodeID's existing kindMap handles them. Tests: - pkg/topology/pseudokinds_test.go: table tests covering every remap case plus the pass-throughs (including the wrong-group-same-kind guards: Service under argoproj.io, Route under route.openshift.io). - internal/server/ai_handlers_group_test.go: * TestAIGetResource_GroupRoutesRelationshipsToKnative — seeds a Knative Service in the dynamic cache co-named with the core Service plus an Ingress backend-ref'd to the core Service. Asserts the knative-routed response does NOT leak the core Service's Ingress into resourceContext.exposes (locked the regression to fail without the KindForGVK funnel). * TestAIGetResource_IssueSummaryCountsURLPluralKind — asserts a broken Deployment (UnavailableReplicas=3) surfaces with count > 0 in resourceContext.issueSummary when fetched via the URL plural. Fixture additions in TestMain: - broken/stuck-app Deployment (UnavailableReplicas=3): seeds a known problem for the issue-summary regression test, in its own namespace so it doesn't perturb default-namespace smoke tests. - default/nginx-ingress Ingress routing to Service "nginx": the distinguishing edge between core Service and Knative Service topology nodes. --- internal/server/ai_handlers.go | 20 ++- internal/server/ai_handlers_group_test.go | 183 ++++++++++++++++++++++ internal/server/server_smoke_test.go | 67 ++++++++ pkg/topology/pseudokinds.go | 48 ++++++ pkg/topology/pseudokinds_test.go | 61 ++++++++ 5 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 pkg/topology/pseudokinds.go create mode 100644 pkg/topology/pseudokinds_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ebab36f3..f359904cd 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -308,7 +308,7 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin canonicalKind = kind } - issueSum := computeIssueSummaryForResource(cache, kind, namespace, name) + issueSum := computeIssueSummaryForResource(cache, canonicalKind, namespace, name) auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) opts := resourcecontext.Options{ @@ -335,8 +335,17 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin // vs core/v1 Service). idx=nil is fine for single-resource: the // per-call inline scan is O(E) once. Bulk callers (T12/T89) should // build a shared index via topology.IndexByResource(topo). + // + // Route through KindForGVK so cross-group CRDs whose Kind collides + // with a core kind (Knative {Service, Configuration, Revision, Route}, + // CAPI Cluster, Istio Gateway) resolve to the right topology node. + // The builder writes those as pseudo-kinds (e.g. "knativeservice/...") + // — without remapping, buildNodeID would resolve "services/..." to + // the core Service node and walk the wrong edges. + gvk := obj.GetObjectKind().GroupVersionKind() + relKind := topology.KindForGVK(gvk.Kind, gvk.Group) opts.Relationships = topology.GetRelationshipsWithObject( - kind, namespace, name, obj, topo, prov, dyn, nil, + relKind, namespace, name, obj, topo, prov, dyn, nil, ) } @@ -379,6 +388,13 @@ func (s *Server) topologyForContext(namespace string) (*topology.Topology, topol // is done client-side; the composer's native namespace filter restricts the // scan to the resource's namespace so we don't walk the whole cluster. // +// kind MUST be the Pascal singular form the issue composer writes into +// Issue.Kind (e.g. "Deployment", "Pod") — the caller derives it from obj's +// TypeMeta. The composer's Filters.Kinds matcher case-folds both sides, but +// it does NOT plural-to-singular convert, so URL forms ("deployments", +// "pods") drop every issue ("deployments" != lower("Deployment")) and the +// summary silently collapses to nil. +// // Returns nil when no issues match — Build then omits the IssueSummary field. func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { if cache == nil { diff --git a/internal/server/ai_handlers_group_test.go b/internal/server/ai_handlers_group_test.go index da8891066..b6e0aabbe 100644 --- a/internal/server/ai_handlers_group_test.go +++ b/internal/server/ai_handlers_group_test.go @@ -124,6 +124,189 @@ func TestAIGetResource_GroupRoutesToDynamic(t *testing.T) { } } +// Group-qualified AI GET must also route the topology relationship lookup +// to the matching pseudo-kind node. The bug: handleAIGetResource passed the +// URL plural "services" straight into topology.GetRelationshipsWithObject, +// which feeds buildNodeID — and buildNodeID's kindMap resolves "services" +// to "service", landing on the CORE Service's topology node. For a Knative +// Service request, the response then carried the core Service's incoming +// Ingress edge as resourceContext.exposes, which is provably wrong. +// +// Fix: derive a topology-pseudo-kind via topology.KindForGVK(gvk.Kind, +// gvk.Group) — for Knative Service, that yields "knativeservice", whose +// node has no Ingress edge in this fixture and therefore no Exposes. +// +// Differentiator: the TestMain fixture seeds an Ingress backend-ref'd to +// the core Service "nginx" in "default". The Knative Service "nginx" in +// "default" (seeded below into the dynamic cache) is a separate topology +// node with NO incoming Ingress edges. The test asserts that the +// resourceContext returned for the ?group=serving.knative.dev request +// does NOT advertise that Ingress — the same fixture, when queried +// without ?group, DOES surface it (locked down by the trailing sub-test +// to pin the regression's pre-fix shape and prevent a future change that +// silently drops the core-side relationship as well). +func TestAIGetResource_GroupRoutesRelationshipsToKnative(t *testing.T) { + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + // The Knative Service request MUST NOT inherit the core Service's + // Ingress in resourceContext.exposes. Pre-fix, the URL "services" was + // passed into buildNodeID and resolved to "service/default/nginx" — + // the wrong topology node — so the Ingress leaked. + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev") + if err != nil { + t.Fatalf("GET (knative): %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var knBody map[string]any + if err := json.NewDecoder(resp.Body).Decode(&knBody); err != nil { + t.Fatalf("decode (knative): %v", err) + } + knRC, _ := knBody["resourceContext"].(map[string]any) + if knRC == nil { + t.Fatal("knative response missing resourceContext") + } + exposes, _ := knRC["exposes"].([]any) + for _, e := range exposes { + em, _ := e.(map[string]any) + kind, _ := em["kind"].(string) + name, _ := em["name"].(string) + if kind == "Ingress" && name == "nginx-ingress" { + t.Fatalf("knative-routed request leaked the core Service's Ingress into resourceContext.exposes "+ + "(got %+v) — relationship lookup did NOT remap to the knativeservice topology node; "+ + "check that handleAIGetResource is funneling kind through topology.KindForGVK", exposes) + } + } + + // Co-anchored sibling: when no ?group is passed, the same path resolves + // to the core Service node and MUST still surface the Ingress. This + // half guards against an over-correction that nukes the relationship + // lookup for the dominant typed-cache case while fixing the CRD case. + respCore, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx") + if err != nil { + t.Fatalf("GET (core): %v", err) + } + defer respCore.Body.Close() + var coreBody map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreBody); err != nil { + t.Fatalf("decode (core): %v", err) + } + coreRC, _ := coreBody["resourceContext"].(map[string]any) + coreExposes, _ := coreRC["exposes"].([]any) + foundIngress := false + for _, e := range coreExposes { + em, _ := e.(map[string]any) + if em["kind"] == "Ingress" && em["name"] == "nginx-ingress" { + foundIngress = true + break + } + } + if !foundIngress { + t.Errorf("core Service request lost the Ingress from resourceContext.exposes (got %+v) — "+ + "the fix overshot and broke the typed-cache relationship lookup", coreExposes) + } +} + +// Pin Finding 1: the AI GET handler used to pass the URL-plural kind +// ("deployments") into computeIssueSummaryForResource, which forwards +// it to issues.Compose via Filters.Kinds. The composer's applyFilters +// case-folds both sides (strings.ToLower) but does NOT plural-to-singular +// convert — and Issue.Kind is the canonical Pascal singular ("Deployment"). +// So the filter set {"deployments"} never matched lower("Deployment") = +// "deployment", every issue got dropped, and IssueSummary.Count silently +// collapsed to 0 (Build then omits the field entirely). +// +// Fix: pass canonicalKind (derived from obj.GVK) into +// computeIssueSummaryForResource so the filter is "Deployment" → matched. +// +// Fixture: TestMain seeds Deployment broken/stuck-app with +// UnavailableReplicas=3. DetectProblems emits a Pascal-singular +// "Deployment" problem for it. Hitting /api/ai/resources/deployments/... +// (URL plural) must surface the issue in resourceContext.issueSummary +// with count > 0 — pre-fix this came back as null. +func TestAIGetResource_IssueSummaryCountsURLPluralKind(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/deployments/broken/stuck-app") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode: %v", err) + } + rc, _ := body["resourceContext"].(map[string]any) + if rc == nil { + t.Fatal("response missing resourceContext") + } + issueSum, _ := rc["issueSummary"].(map[string]any) + if issueSum == nil { + t.Fatalf("resourceContext.issueSummary is nil — composer filter dropped every issue. "+ + "Likely the handler is still passing URL-plural kind ('deployments') into "+ + "computeIssueSummaryForResource instead of canonical Pascal singular ('Deployment'). "+ + "Got: %+v", rc) + } + count, _ := issueSum["count"].(float64) + if count < 1 { + t.Fatalf("issueSummary.count = %v, want >= 1 — DetectProblems should have flagged "+ + "the broken/stuck-app Deployment (UnavailableReplicas=3)", count) + } +} + // Happy-path sibling for the test above: when no group is passed, the // typed-cache-first path is correct (and must continue to be — the v1 // core Service is the dominant case and must not pay a dynamic-cache diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 90dcbef37..21fb3e216 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -11,7 +11,11 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" +<<<<<<< HEAD rbacv1 "k8s.io/api/rbac/v1" +======= + networkingv1 "k8s.io/api/networking/v1" +>>>>>>> b01d112 (fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup) metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -28,6 +32,7 @@ var ( func TestMain(m *testing.M) { replicas := int32(1) + brokenReplicas := int32(3) deployUID := "deploy-uid-1234" rsUID := "rs-uid-5678" @@ -37,6 +42,37 @@ func TestMain(m *testing.M) { ObjectMeta: metav1.ObjectMeta{Name: "default"}, Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, }, + &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "broken"}, + Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, + }, + // Broken Deployment in its own namespace so it doesn't perturb the + // "default" fixture used by every other smoke test. Used by + // TestAIGetResource_IssueSummaryCountsURLPluralKind to assert the + // composer's URL-plural-kind filter actually matches the canonical + // Pascal-singular Issue.Kind values — pre-fix, count was 0. + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stuck-app", + Namespace: "broken", + Labels: map[string]string{"app": "stuck"}, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &brokenReplicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "stuck"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "stuck"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "stuck", Image: "registry.example/stuck:1"}}}, + }, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 3, + AvailableReplicas: 0, + UnavailableReplicas: 3, + }, + }, &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "nginx", @@ -137,6 +173,37 @@ func TestMain(m *testing.M) { Ports: []corev1.ServicePort{{Port: 80, TargetPort: intstr.FromInt(80)}}, }, }, + // Ingress routing to the core Service "nginx". Used by + // TestAIGetResource_GroupRoutesRelationshipsToKnative to give the + // core Service a distinct incoming edge (EdgeRoutesTo) that the + // Knative Service node does NOT inherit — the test compares whether + // the AI GET handler picks up that edge under ?group=serving.knative.dev + // (regression for the kind-passed-to-relationship-lookup bug). + &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-ingress", + Namespace: "default", + }, + Spec: networkingv1.IngressSpec{ + Rules: []networkingv1.IngressRule{{ + Host: "nginx.example.com", + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{{ + Path: "/", + PathType: func() *networkingv1.PathType { p := networkingv1.PathTypePrefix; return &p }(), + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: "nginx", + Port: networkingv1.ServiceBackendPort{Number: 80}, + }, + }, + }}, + }, + }, + }}, + }, + }, // Seed Secrets in two namespaces so per-user RBAC tests can // distinguish "gate denied → []" from "no secrets in cache" and can // exercise the partial-allow case (one ns allowed, the other denied). diff --git a/pkg/topology/pseudokinds.go b/pkg/topology/pseudokinds.go new file mode 100644 index 000000000..f1f8e500f --- /dev/null +++ b/pkg/topology/pseudokinds.go @@ -0,0 +1,48 @@ +package topology + +// KindForGVK maps a (kind, group) pair to the topology-internal pseudo-kind +// the builder uses for node IDs. The topology builder synthesizes pseudo-kinds +// for a handful of CRDs whose Kind collides with a core kind under a different +// API group — these collisions would otherwise produce ambiguous node IDs. +// +// Callers that already hold the resource's apiVersion (i.e., obj.GVK) and want +// to look up the matching topology node MUST funnel kind through this helper, +// otherwise buildNodeID would resolve to the core node and return relationships +// for the wrong object. +// +// Today the cross-group collisions are: +// +// serving.knative.dev/Service → "knativeservice" +// serving.knative.dev/Configuration → "knativeconfiguration" +// serving.knative.dev/Revision → "knativerevision" +// serving.knative.dev/Route → "knativeroute" +// cluster.x-k8s.io/Cluster → "capicluster" +// networking.istio.io/Gateway → "istiogateway" +// +// For any other (kind, group) pair — including core kinds with group=="" and +// non-colliding CRDs — KindForGVK returns kind unchanged. buildNodeID's own +// kindMap then handles URL-plural-to-singular flattening. +func KindForGVK(kind, group string) string { + switch group { + case "serving.knative.dev": + switch kind { + case "Service": + return "knativeservice" + case "Configuration": + return "knativeconfiguration" + case "Revision": + return "knativerevision" + case "Route": + return "knativeroute" + } + case "cluster.x-k8s.io": + if kind == "Cluster" { + return "capicluster" + } + case "networking.istio.io": + if kind == "Gateway" { + return "istiogateway" + } + } + return kind +} diff --git a/pkg/topology/pseudokinds_test.go b/pkg/topology/pseudokinds_test.go new file mode 100644 index 000000000..91786564e --- /dev/null +++ b/pkg/topology/pseudokinds_test.go @@ -0,0 +1,61 @@ +package topology + +import "testing" + +// KindForGVK is the bridge between (obj.Kind, obj.Group) and the topology +// builder's pseudo-kind node-ID prefix. The builder emits pseudo-kinds for +// CRDs whose Kind collides with a core kind under a different group +// (Knative Service vs core Service, CAPI Cluster vs… nothing today but a +// future "Cluster" core kind, Istio Gateway vs Gateway API Gateway). +// +// A regression in this helper silently routes single-resource relationship +// lookups for those CRDs to the wrong topology node, so the table covers +// every group remapping plus the pass-through cases. +func TestKindForGVK(t *testing.T) { + tests := []struct { + name string + kind string + group string + want string + }{ + // Knative Serving collisions. + {"knative service", "Service", "serving.knative.dev", "knativeservice"}, + {"knative configuration", "Configuration", "serving.knative.dev", "knativeconfiguration"}, + {"knative revision", "Revision", "serving.knative.dev", "knativerevision"}, + {"knative route", "Route", "serving.knative.dev", "knativeroute"}, + // CAPI collision (Cluster, distinct from any future "Cluster" core kind). + {"capi cluster", "Cluster", "cluster.x-k8s.io", "capicluster"}, + // Istio Gateway collision (vs Gateway API's gateway.networking.k8s.io/Gateway). + {"istio gateway", "Gateway", "networking.istio.io", "istiogateway"}, + + // Pass-through: core kinds (group == ""). + {"core service passthrough", "Service", "", "Service"}, + {"core pod passthrough", "Pod", "", "Pod"}, + // Pass-through: apps group. + {"apps deployment passthrough", "Deployment", "apps", "Deployment"}, + {"batch job passthrough", "Job", "batch", "Job"}, + // Pass-through: Gateway API (uses the gateway.networking.k8s.io group, + // distinct from networking.istio.io — must NOT be remapped to istiogateway). + {"gateway api gateway passthrough", "Gateway", "gateway.networking.k8s.io", "Gateway"}, + // Pass-through: non-colliding CRDs. + {"argo application passthrough", "Application", "argoproj.io", "Application"}, + {"cert-manager certificate passthrough", "Certificate", "cert-manager.io", "Certificate"}, + // Pass-through: a Kind that matches a Knative collision but under the + // wrong group must NOT remap. Guards against accidental kind-only + // matching that would mis-classify e.g. core Route or future CRDs. + {"route under wrong group", "Route", "route.openshift.io", "Route"}, + {"service under wrong group", "Service", "argoproj.io", "Service"}, + // Empty kind: pass-through (caller's problem to validate). + {"empty kind", "", "serving.knative.dev", ""}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := KindForGVK(tc.kind, tc.group) + if got != tc.want { + t.Errorf("KindForGVK(%q, %q) = %q, want %q", tc.kind, tc.group, got, tc.want) + } + }) + } +} From 9f7887cd40c2b0bedeeb79cf1a73deb334cdbcfb Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 12:21:31 +0300 Subject: [PATCH 09/17] chore(resourcecontext): drop Hints prose projection from v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the prose `Hints []string` field, the `EmitHints` option, the SynthesizeHints generator, and all associated tests. ~430 LoC of net removal. Rationale: our dominant agent consumer (Claude-class) composes triage prose from the structured fields trivially. The pre-baked Hints array added wire bytes + prompt tokens with no net signal for that consumer. Once shipped, agents pattern-matching on Hint substrings would have ossified the wording. The structured fields (ManagedBy, Exposes, SelectedBy, Uses, RunsOn, ScaledBy, IssueSummary, AuditSummary, PolicySummary, Omitted) carry every fact a derived prose line would encode — agents that need narrative can compose it. If a real consumer emerges that needs deterministic prose, add it as a separate `explain_resource` MCP tool. Keeping it inline was a premature bet against asymmetric costs: easy to add later, hard to remove or evolve once agents depend on the strings. The doc comment on ResourceContext records the decision so future readers don't re-introduce the projection without revisiting the tradeoff. --- internal/server/ai_handlers.go | 5 +- pkg/resourcecontext/build.go | 9 -- pkg/resourcecontext/build_test.go | 61 -------- pkg/resourcecontext/hints.go | 241 ------------------------------ pkg/resourcecontext/hints_test.go | 118 --------------- pkg/resourcecontext/types.go | 12 +- pkg/resourcecontext/types_test.go | 3 - 7 files changed, 9 insertions(+), 440 deletions(-) delete mode 100644 pkg/resourcecontext/hints.go delete mode 100644 pkg/resourcecontext/hints_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index f359904cd..a188a92e1 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -314,7 +314,6 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, AccessChecker: s.newRequestScopedChecker(r), - EmitHints: true, IssueSummary: issueSum, AuditSummary: auditSum, } @@ -463,8 +462,8 @@ func composeSeverityRank(s issues.Severity) int { // // TopFinding is selected deterministically: highest severity wins, with // CheckID as the ascending tiebreaker. Map iteration ordering does NOT -// influence the choice — relevant because SynthesizeHints downstream -// advertises deterministic output. +// influence the choice — agents pinning regression tests on +// resourceContext output rely on stable field values across runs. func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { if cache == nil || kind == "" { return nil diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index e8ef0f93a..4f294a3bb 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -59,10 +59,6 @@ type Options struct { IssueSummary *IssueSummary AuditSummary *AuditSummary PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings - - // EmitHints controls whether SynthesizeHints runs over the structured - // fields. AI-facing callers (MCP, /api/ai/*) set true; UI callers false. - EmitHints bool } // PolicyReportLookup is the minimal interface Build needs from the @@ -238,11 +234,6 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } } - // 6. Hints — AI-only. - if opts.EmitHints { - rc.Hints = SynthesizeHints(rc, opts.Tier) - } - rc.Omitted = omitted.collect() return rc } diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 3a71b81ec..d14c474c9 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -3,7 +3,6 @@ package resourcecontext import ( "context" "encoding/json" - "reflect" "testing" appsv1 "k8s.io/api/apps/v1" @@ -132,7 +131,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, IssueSummary: &IssueSummary{ Count: 1, HighestSeverity: "critical", TopReason: "ImagePullBackOff", BySource: map[string]int{"problem": 1}, @@ -203,20 +201,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount) } - // Hints: deterministic ordering covers the high-signal fields. - wantHints := []string{ - "Managed by Application storefront", - "1 issue (critical: ImagePullBackOff)", - "Running on node node-1", - "Exposed by 1 Service", - "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", - "Scaled by 1 HorizontalPodAutoscaler", - "Uses 2 ConfigMaps, 2 Secrets, 1 PVC, ServiceAccount web-sa", - } - if !reflect.DeepEqual(rc.Hints, wantHints) { - t.Errorf("Hints mismatch.\n got: %v\nwant: %v", rc.Hints, wantHints) - } - // Pre-computed summaries are passed through. if rc.IssueSummary == nil || rc.IssueSummary.Count != 1 { t.Errorf("IssueSummary not passed through: %+v", rc.IssueSummary) @@ -243,7 +227,6 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { rc := Build(context.Background(), dep, Options{ Tier: TierBasic, AccessChecker: allowAllChecker{}, - EmitHints: true, }) if rc == nil { t.Fatal("Build returned nil") @@ -258,10 +241,6 @@ func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) { if mb.Group != "helm.toolkit.fluxcd.io" { t.Errorf("ManagedBy[0].Group: got %q", mb.Group) } - wantHint := "Managed by HelmRelease web" - if len(rc.Hints) == 0 || rc.Hints[0] != wantHint { - t.Errorf("first Hint: got %v want %q", rc.Hints, wantHint) - } } func TestBuild_Service_ExposedByIngress(t *testing.T) { @@ -281,7 +260,6 @@ func TestBuild_Service_ExposedByIngress(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if got, want := len(rc.Exposes), 1; got != want { @@ -317,7 +295,6 @@ func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if rc == nil { t.Fatal("Build returned nil") @@ -354,7 +331,6 @@ func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) if got, want := len(rc.ManagedBy), 1; got != want { t.Fatalf("ManagedBy len: got %d want %d", got, want) @@ -382,7 +358,6 @@ func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { rc := Build(context.Background(), pod, Options{ Tier: TierBasic, AccessChecker: denyChecker{group: "", kind: "Secret", namespace: "prod"}, - EmitHints: true, }) if rc.Uses != nil && len(rc.Uses.Secrets) != 0 { t.Errorf("Secrets should be empty after deny; got %+v", rc.Uses.Secrets) @@ -399,30 +374,6 @@ func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { } } -func TestBuild_EmitHintsFalse_NoHints(t *testing.T) { - // Flux Helm labels — detected from obj metadata directly via - // topology.SynthesizeManagedBy without needing a populated Topology. - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod", - Labels: map[string]string{ - "helm.toolkit.fluxcd.io/name": "web", - "helm.toolkit.fluxcd.io/namespace": "flux-system", - }}, - } - rc := Build(context.Background(), dep, Options{ - Tier: TierBasic, - AccessChecker: allowAllChecker{}, - EmitHints: false, - }) - if len(rc.Hints) != 0 { - t.Errorf("EmitHints=false but got hints: %v", rc.Hints) - } - // Structured fields still populated. - if len(rc.ManagedBy) != 1 { - t.Errorf("ManagedBy should still be populated: %+v", rc.ManagedBy) - } -} - func TestBuild_NilObj(t *testing.T) { if rc := Build(context.Background(), nil, Options{}); rc != nil { t.Errorf("Build(nil) = %+v, want nil", rc) @@ -457,7 +408,6 @@ func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, PolicyReports: reports, - EmitHints: true, }) if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) @@ -469,16 +419,6 @@ func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { if len(k.Top) != 0 { t.Errorf("basic tier must NOT emit Top[]; got %d entries: %+v", len(k.Top), k.Top) } - gotHint := false - for _, h := range rc.Hints { - if h == "Kyverno: 1 failing, 1 warning" { - gotHint = true - break - } - } - if !gotHint { - t.Errorf("expected Kyverno hint; got %v", rc.Hints) - } } func TestBuild_PolicyReports_DiagnosticTierIncludesTop(t *testing.T) { @@ -540,7 +480,6 @@ func TestBuild_PDB_OutputJSONShape(t *testing.T) { Tier: TierBasic, AccessChecker: allowAllChecker{}, Topology: topo, - EmitHints: true, }) b, err := json.MarshalIndent(rc, "", " ") if err != nil { diff --git a/pkg/resourcecontext/hints.go b/pkg/resourcecontext/hints.go deleted file mode 100644 index a385f455f..000000000 --- a/pkg/resourcecontext/hints.go +++ /dev/null @@ -1,241 +0,0 @@ -package resourcecontext - -import ( - "fmt" - "sort" - "strings" -) - -// SynthesizeHints renders a short, deterministic prose summary of the -// structured fields in c. Returns at most maxHintsBasic lines for -// TierBasic; future tiers can expand the budget. -// -// Ordering is fixed (not data-driven) so golden tests stay stable across -// runs. No LLM is involved — every line maps to a single rule. -// -// Callers SHOULD NOT parse hints — the structured fields are the canonical -// surface. Hints exist solely as a prose convenience for AI consumers. -func SynthesizeHints(c *ResourceContext, tier ContextTier) []string { - if c == nil { - return nil - } - - max := maxHintsBasic - if tier == TierDiagnostic { - max = maxHintsDiagnostic - } - - out := make([]string, 0, max) - - if h := managedByHint(c.ManagedBy); h != "" { - out = append(out, h) - } - if h := issueHint(c.IssueSummary); h != "" { - out = append(out, h) - } - if h := auditHint(c.AuditSummary); h != "" { - out = append(out, h) - } - if h := runsOnHint(c.RunsOn); h != "" { - out = append(out, h) - } - if h := exposesHint(c.Exposes); h != "" { - out = append(out, h) - } - if h := selectedByHint(c.SelectedBy); h != "" { - out = append(out, h) - } - if h := scaledByHint(c.ScaledBy); h != "" { - out = append(out, h) - } - if h := usesHint(c.Uses); h != "" { - out = append(out, h) - } - if h := policyHint(c.PolicySummary); h != "" { - out = append(out, h) - } - - if len(out) > max { - out = out[:max] - } - if len(out) == 0 { - return nil - } - return out -} - -const ( - maxHintsBasic = 8 - maxHintsDiagnostic = 12 -) - -func managedByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - m := refs[0] - return fmt.Sprintf("Managed by %s %s", m.Kind, m.Name) -} - -func issueHint(s *IssueSummary) string { - if s == nil || s.Count == 0 { - return "" - } - noun := pluralize("issue", s.Count) - var b strings.Builder - fmt.Fprintf(&b, "%d %s", s.Count, noun) - if s.HighestSeverity != "" { - fmt.Fprintf(&b, " (%s", s.HighestSeverity) - if s.TopReason != "" { - fmt.Fprintf(&b, ": %s", s.TopReason) - } - b.WriteString(")") - } else if s.TopReason != "" { - fmt.Fprintf(&b, ": %s", s.TopReason) - } - return b.String() -} - -func auditHint(s *AuditSummary) string { - if s == nil || s.Count == 0 { - return "" - } - noun := pluralize("audit finding", s.Count) - if s.HighestSeverity == "" { - return fmt.Sprintf("%d %s", s.Count, noun) - } - return fmt.Sprintf("%d %s (%s)", s.Count, noun, s.HighestSeverity) -} - -func runsOnHint(r *ContextRef) string { - if r == nil { - return "" - } - return fmt.Sprintf("Running on node %s", r.Name) -} - -func exposesHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - return fmt.Sprintf("Exposed by %s", summarizeKindsCounts(refs)) -} - -func selectedByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - // Distinguish known SelectedBy kinds (PDB vs NetworkPolicy) in the hint — - // they read very differently to a human, and lumping them together loses - // signal. Match each kind explicitly: a future kind added to SelectedBy - // (e.g. ValidatingAdmissionPolicy) would otherwise be silently rendered - // as NetworkPolicy. Unrecognized kinds drop through to summarizeKindsCounts. - var pdb, np, other []ContextRef - for _, r := range refs { - switch r.Kind { - case "PodDisruptionBudget": - pdb = append(pdb, r) - case "NetworkPolicy": - np = append(np, r) - default: - other = append(other, r) - } - } - parts := make([]string, 0, 3) - if n := len(np); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("NetworkPolicy", n))) - } - if n := len(pdb); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("PodDisruptionBudget", n))) - } - if len(other) > 0 { - parts = append(parts, summarizeKindsCounts(other)) - } - return strings.Join(parts, " and ") + " " + selectVerb(len(refs)) -} - -func selectVerb(n int) string { - if n == 1 { - return "selects this resource" - } - return "select this resource" -} - -func scaledByHint(refs []ContextRef) string { - if len(refs) == 0 { - return "" - } - return fmt.Sprintf("Scaled by %s", summarizeKindsCounts(refs)) -} - -func usesHint(u *UsesBlock) string { - if u == nil { - return "" - } - parts := make([]string, 0, 4) - if n := len(u.ConfigMaps); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("ConfigMap", n))) - } - if n := len(u.Secrets); n > 0 { - parts = append(parts, fmt.Sprintf("%d %s", n, pluralize("Secret", n))) - } - if n := len(u.PVCs); n > 0 { - parts = append(parts, fmt.Sprintf("%d PVCs", n)) - if n == 1 { - parts[len(parts)-1] = "1 PVC" - } - } - if u.ServiceAccount != nil { - parts = append(parts, fmt.Sprintf("ServiceAccount %s", u.ServiceAccount.Name)) - } - if len(parts) == 0 { - return "" - } - return "Uses " + strings.Join(parts, ", ") -} - -func policyHint(s *PolicySummary) string { - if s == nil || s.Kyverno == nil { - return "" - } - k := s.Kyverno - if k.Fail == 0 && k.Warn == 0 { - return "" - } - parts := make([]string, 0, 2) - if k.Fail > 0 { - parts = append(parts, fmt.Sprintf("%d failing", k.Fail)) - } - if k.Warn > 0 { - parts = append(parts, fmt.Sprintf("%d warning", k.Warn)) - } - return "Kyverno: " + strings.Join(parts, ", ") -} - -// summarizeKindsCounts groups refs by kind and emits "N Kind, M OtherKind" -// (deterministic order: alphabetical by kind). -func summarizeKindsCounts(refs []ContextRef) string { - counts := make(map[string]int) - for _, r := range refs { - counts[r.Kind]++ - } - kinds := make([]string, 0, len(counts)) - for k := range counts { - kinds = append(kinds, k) - } - sort.Strings(kinds) - parts := make([]string, 0, len(kinds)) - for _, k := range kinds { - parts = append(parts, fmt.Sprintf("%d %s", counts[k], pluralize(k, counts[k]))) - } - return strings.Join(parts, ", ") -} - -// pluralize returns word + "s" when n != 1. Kept English-only; resource -// kinds are loanwords (Pod, Service, etc.) so naive pluralization works. -func pluralize(word string, n int) string { - if n == 1 { - return word - } - return word + "s" -} diff --git a/pkg/resourcecontext/hints_test.go b/pkg/resourcecontext/hints_test.go deleted file mode 100644 index 8bc0a30d6..000000000 --- a/pkg/resourcecontext/hints_test.go +++ /dev/null @@ -1,118 +0,0 @@ -package resourcecontext - -import ( - "reflect" - "testing" -) - -func TestSynthesizeHints_NilCtx(t *testing.T) { - if got := SynthesizeHints(nil, TierBasic); got != nil { - t.Errorf("nil ctx: got %v, want nil", got) - } -} - -func TestSynthesizeHints_EmptyCtx(t *testing.T) { - rc := &ResourceContext{Tier: TierBasic} - got := SynthesizeHints(rc, TierBasic) - if got != nil { - t.Errorf("empty rc: got %v, want nil", got) - } -} - -func TestSynthesizeHints_DeterministicOrdering(t *testing.T) { - rc := &ResourceContext{ - ManagedBy: []ContextRef{{Kind: "Application", Name: "store"}}, - Exposes: []ContextRef{{Kind: "Service", Name: "api"}}, - SelectedBy: []ContextRef{ - {Kind: "NetworkPolicy", Name: "deny"}, - {Kind: "PodDisruptionBudget", Name: "pdb"}, - }, - ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "hpa"}}, - RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, - Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}}, - IssueSummary: &IssueSummary{Count: 2, HighestSeverity: "warning", TopReason: "Backoff"}, - AuditSummary: &AuditSummary{Count: 3, HighestSeverity: "danger"}, - } - want := []string{ - "Managed by Application store", - "2 issues (warning: Backoff)", - "3 audit findings (danger)", - "Running on node n1", - "Exposed by 1 Service", - "1 NetworkPolicy and 1 PodDisruptionBudget select this resource", - "Scaled by 1 HorizontalPodAutoscaler", - "Uses 1 ConfigMap", - } - got := SynthesizeHints(rc, TierBasic) - if !reflect.DeepEqual(got, want) { - t.Errorf("hints mismatch:\n got: %v\nwant: %v", got, want) - } -} - -func TestSynthesizeHints_BasicTierCapped(t *testing.T) { - // Synthesize a maxed-out context and verify the basic tier caps at - // maxHintsBasic lines. This guards against unbounded hint growth. - rc := &ResourceContext{ - ManagedBy: []ContextRef{{Kind: "App", Name: "a"}}, - Exposes: []ContextRef{{Kind: "Service", Name: "svc"}}, - SelectedBy: []ContextRef{{Kind: "PodDisruptionBudget", Name: "p"}, {Kind: "NetworkPolicy", Name: "n"}}, - ScaledBy: []ContextRef{{Kind: "HorizontalPodAutoscaler", Name: "h"}}, - RunsOn: &ContextRef{Kind: "Node", Name: "n1"}, - Uses: &UsesBlock{ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "c"}}, Secrets: []ContextRef{{Kind: "Secret", Name: "s"}}}, - IssueSummary: &IssueSummary{Count: 1, HighestSeverity: "critical", TopReason: "Crash"}, - AuditSummary: &AuditSummary{Count: 1, HighestSeverity: "danger", TopFinding: "CKV_K8S_1"}, - PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Fail: 1, Warn: 1}}, - } - got := SynthesizeHints(rc, TierBasic) - if len(got) > maxHintsBasic { - t.Errorf("basic tier exceeded cap: got %d hints, want ≤%d (%v)", len(got), maxHintsBasic, got) - } -} - -func TestSynthesizeHints_IssueHint_NoSeverity(t *testing.T) { - rc := &ResourceContext{IssueSummary: &IssueSummary{Count: 1, TopReason: "Pending"}} - got := SynthesizeHints(rc, TierBasic) - want := []string{"1 issue: Pending"} - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestSynthesizeHints_PolicyHint_OnlyPass_Skipped(t *testing.T) { - rc := &ResourceContext{PolicySummary: &PolicySummary{Kyverno: &KyvernoSummary{Pass: 3}}} - got := SynthesizeHints(rc, TierBasic) - if got != nil { - t.Errorf("only-pass summary should not emit a hint; got %v", got) - } -} - -func TestUsesHint_PVCSingular(t *testing.T) { - rc := &ResourceContext{Uses: &UsesBlock{PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data"}}}} - got := SynthesizeHints(rc, TierBasic) - want := []string{"Uses 1 PVC"} - if !reflect.DeepEqual(got, want) { - t.Errorf("got %v, want %v", got, want) - } -} - -func TestSelectVerb(t *testing.T) { - if selectVerb(1) != "selects this resource" { - t.Errorf("verb(1): %q", selectVerb(1)) - } - if selectVerb(2) != "select this resource" { - t.Errorf("verb(2): %q", selectVerb(2)) - } -} - -func TestSummarizeKindsCounts_AlphabeticalOrder(t *testing.T) { - refs := []ContextRef{ - {Kind: "Service", Name: "a"}, - {Kind: "Ingress", Name: "b"}, - {Kind: "Service", Name: "c"}, - } - got := summarizeKindsCounts(refs) - want := "1 Ingress, 2 Services" - if got != want { - t.Errorf("got %q want %q", got, want) - } -} diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 15f5cfee7..6e98d0bec 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -20,10 +20,13 @@ package resourcecontext // response. Every field is optional; the zero value is a valid (empty) // "basic"-tier context. // -// Hints is an optional, presentation-only field — populated by AI-facing -// callers (MCP, /api/ai/*) and omitted by UI-facing callers. The structured -// fields above are the canonical facts; hints are a derived prose -// projection. +// All fields are structured. A prose `Hints []string` projection was +// considered (and prototyped) but cut from v1: our dominant agent consumer +// composes triage prose from the structured fields itself, the additional +// wire bytes earned no net signal, and once shipped, agents pattern-matching +// on hint substrings would have ossified the wording. If a real consumer +// emerges that needs deterministic prose, add it as a separate +// `explain_resource` tool rather than re-introducing it inline here. type ResourceContext struct { Tier ContextTier `json:"tier"` ManagedBy []ContextRef `json:"managedBy,omitempty"` @@ -35,7 +38,6 @@ type ResourceContext struct { IssueSummary *IssueSummary `json:"issueSummary,omitempty"` AuditSummary *AuditSummary `json:"auditSummary,omitempty"` PolicySummary *PolicySummary `json:"policySummary,omitempty"` - Hints []string `json:"hints,omitempty"` Omitted []OmittedField `json:"omitted,omitempty"` } diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index 2e891594c..f8174fe3e 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -105,7 +105,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { IssueSummary: &IssueSummary{Count: 1}, AuditSummary: &AuditSummary{Count: 2}, PolicySummary: &PolicySummary{}, - Hints: []string{"hint"}, Omitted: []OmittedField{{Field: "selectedBy", Reason: OmittedRBACDenied}}, } b, err := json.Marshal(ac) @@ -124,7 +123,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { `"issueSummary"`, `"auditSummary"`, `"policySummary"`, - `"hints"`, `"omitted"`, } prev := -1 @@ -197,7 +195,6 @@ func TestResourceContextRoundTrip(t *testing.T) { }}, }, }, - Hints: []string{"Managed by Deployment api"}, Omitted: []OmittedField{ {Field: "selectedBy.networkPolicies", Reason: OmittedRBACDenied}, {Field: "policySummary.kyverno", Reason: OmittedNotInstalled}, From 1b56e9ba11dcd0ba44cb6ff340b602d8c45ef0b9 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 13:27:45 +0300 Subject: [PATCH 10/17] docs(ai): declare /api/ai/* outside the OpenAPI spec, with reasoning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Radar is OpenAPI-first per CLAUDE.md, but that discipline was adopted to keep the Go backend and the TypeScript SPA client in sync — one spec, regenerated as server stubs + TS client. /api/ai/* has no SPA consumer; agents read MCP tool descriptions or in-prompt instructions, not OpenAPI specs. Without this doc, a future maintainer might reflexively bring /api/ai/* under openapi.yaml on the assumption that radar's OpenAPI-first stance covers every endpoint. That would pay the spec-authoring tax during agent-surface evolution without earning the SDK-generation benefit. The wave-2 wire shapes already evolved across three review rounds; locking them down in YAML during that churn would have produced spec/code drift, not contract clarity. Top-of-file doc records the intentional opt-out + the revisit triggers (surface stability + public-SDK commitment). Future readers see explicit reasoning instead of accidental omission. --- internal/server/ai_handlers.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index a188a92e1..1d7cf8a19 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -1,3 +1,30 @@ +// /api/ai/* is the REST mirror of the MCP agent surface. Both target AI +// consumers (Claude, scripted agents) rather than the SPA, and both +// intentionally evolve at agent-iteration speed. +// +// Unlike /api/* (consumed by the SPA via a generated TypeScript client), +// the /api/ai/* surface is NOT specified in openapi.yaml. The original +// motivation for OpenAPI-first in radar was frontend/backend type safety — +// one spec, regenerated as Go server stubs + TS client. That value +// proposition does not apply here: the agent consumer doesn't read +// OpenAPI specs (it reads MCP tool descriptions or in-prompt instructions), +// and the SPA doesn't call these endpoints at all. +// +// Wire shapes for the agent surface live in pkg/resourcecontext (typed +// JSON DTOs) and pkg/topology. MCP tools document their wire via +// jsonschema struct tags. /api/ai/* follows the same code-defined +// discipline as MCP, treating them as one logical surface served over +// two protocols. +// +// Revisit this opt-out when: +// (a) the agent surface stabilizes (no major shape changes for two +// release cycles), AND +// (b) Skyhook commits to a public customer-facing AI SDK that needs +// generated bindings. +// +// Until both conditions are met, bringing /api/ai/* under openapi.yaml +// is premature — it would pay the spec-authoring tax during evolution +// without earning the SDK-generation benefit. package server import ( From bb3fe6d1d8eed7af4d0735e00c9a841a4b28e056 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 14:44:50 +0300 Subject: [PATCH 11/17] fix(resourcecontext): pseudo-kind in fallback, swap CM/Secret reasons, 500 on unknown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Bugbot findings on PR #721: 1. Build's relationship fallback used `ident.Kind` directly (the raw GVK kind like "Service") when opts.Relationships was nil but opts.Topology was set. For Knative serving.knative.dev/Service this resolved to the CORE Service topology node, leaking core relationships into the CRD's resourceContext — defeating the KindForGVK fix that the handler-side pre-computation already applies. Mirror the handler's resolution here so the fallback agrees with the primary path. 2. buildUsesFromPod had the dominant-pattern Reason labels reversed: ConfigMaps got ReasonEnvVarRef, Secrets got ReasonVolumeMount. ConfigMaps surface primarily via volume mounts (config files); Secrets primarily via env (SecretKeyRef). Swapped to match the common case. Doc comment clarifies the label is best-effort per kind — both paths feed both kinds in practice. 3. writeAIFetchError default branch returned 404 NOT_FOUND for any unrecognized error. Unknown errors are server-side (e.g. "resource discovery not initialized") and should surface as 500, not be masked as missing-resource. One-line fix. No test changes — the fixes are correctness-only and existing tests remain valid. --- internal/server/ai_handlers.go | 5 ++++- pkg/resourcecontext/build.go | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 1d7cf8a19..cdca066ca 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -300,7 +300,10 @@ func (s *Server) writeAIFetchError(w http.ResponseWriter, kind string, err error case strings.Contains(msg, "not found"): s.writeError(w, http.StatusNotFound, msg) default: - s.writeError(w, http.StatusNotFound, msg) + // Unknown errors are server-side problems (e.g. "resource discovery + // not initialized", "dynamic resource cache not initialized") — surface + // as 500 so debugging upstream issues isn't masked by a misleading 404. + s.writeError(w, http.StatusInternalServerError, msg) } } diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 4f294a3bb..4ea5e3cdc 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -113,8 +113,15 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // over relationships per row pass them in directly. rel := opts.Relationships if rel == nil && opts.Topology != nil { + // Resolve the topology-pseudo-kind so cross-group CRDs (Knative + // serving.knative.dev/Service, CAPI cluster.x-k8s.io/Cluster, …) + // look up the right node. Using ident.Kind directly would lower- + // case to "service" and resolve to the core Service node, leaking + // the wrong resource's relationships into the CRD's resourceContext. + // The handler-side pre-computation does this same KindForGVK + // resolution; mirror it here so the fallback path doesn't undo it. rel = topology.GetRelationshipsWithObject( - ident.Kind, ident.Namespace, ident.Name, obj, + topology.KindForGVK(ident.Kind, ident.Group), ident.Namespace, ident.Name, obj, opts.Topology, opts.Provider, opts.DynamicProv, opts.RelIndex, ) } @@ -357,9 +364,16 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) + // ConfigMaps are most often surfaced via volume mounts (volume.ConfigMap) + // and Secrets via env (SecretKeyRef on container.env). Both kinds appear + // via both paths in practice, so the single Reason label per kind is a + // best-effort discriminator: it answers "what's the dominant lookup + // pattern for THIS kind?" — not "how was THIS particular ref discovered?" + // Reversing them (the prior labelling had them swapped) made the + // metadata actively misleading. uses := &UsesBlock{ - ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonEnvVarRef, SourceK8sSpec), "uses.configMaps", omitted), - Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonVolumeMount, SourceK8sSpec), "uses.secrets", omitted), + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonVolumeMount, SourceK8sSpec), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonEnvVarRef, SourceK8sSpec), "uses.secrets", omitted), PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), } From 444195a68da5a278fd01a9643215913a708c1879 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 17:53:48 +0300 Subject: [PATCH 12/17] chore(resourcecontext): drop speculative wire surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove fields and enum values with no consumer in v1 and no concrete plan to populate them in T7/T10/T13: - ContextRef.Reason (RefReason enum, 10 values): structurally redundant with the parent field name (selectedBy → selector match, runsOn → node binding, etc.). The ConfigMap/Secret refs were tagged with a single Reason that was inaccurate for the env-from vs volume-mount half of each set — actively misleading on the wire. - ContextRef.Source (RefSource enum, 5 values): internal provenance describing which radar subsystem produced the fact. No agent or UI consumer branches on it; planned policy_report/audit_engine sources never get a ContextRef anyway (T10 emits PolicySummary/AuditSummary rollups). - ContextRef.Confidence: reserved field, never set. Defer until fuzzy joins (Trivy/ConfigAudit) actually land. - Options.MaxTokens: reserved field, never enforced. Add when there's budgeting code. - ResourceContext.Truncated: never set by Build — no truncation path exists in the generator (RBAC drops are reported via Omitted). - OmittedKindUnsupported + OmittedProviderDisabled: unused enum values. Kept rbac_denied / budget_exceeded / cache_cold / not_installed — all four are populated today or planned for T10's policy-report diagnostic per the TODO in internal/k8s/policy_reports.go. --- pkg/resourcecontext/build.go | 50 ++++++++++--------------------- pkg/resourcecontext/build_test.go | 3 -- 2 files changed, 16 insertions(+), 37 deletions(-) diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 4ea5e3cdc..46d2576a9 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -25,8 +25,7 @@ import ( // in internal/* pre-compute IssueSummary / AuditSummary / PolicyReports and // pass them in, so we don't reach into internal/issues or internal/audit. type Options struct { - Tier ContextTier - MaxTokens int // reserved for future budgeting; not enforced in v1 + Tier ContextTier // AccessChecker gates every emitted ContextRef. nil = no gating (treat // as fully authorized — local-kubeconfig / tests). @@ -141,7 +140,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } if len(managedBy) > 0 { rc.ManagedBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(managedBy, ReasonOwnerReference, SourceOwnerChain), + toContextRefs(managedBy), "managedBy", omitted) } @@ -153,18 +152,18 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte exposes = append(exposes, rel.Gateways...) exposes = append(exposes, rel.Routes...) rc.Exposes = filterRefs(ctx, opts.AccessChecker, - toContextRefs(exposes, ReasonLabelSelector, SourceTopology), + toContextRefs(exposes), "exposes", omitted) selected := make([]topology.ResourceRef, 0, len(rel.PDBs)+len(rel.NetworkPolicies)) selected = append(selected, rel.PDBs...) selected = append(selected, rel.NetworkPolicies...) rc.SelectedBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(selected, ReasonPodSelector, SourceTopology), + toContextRefs(selected), "selectedBy", omitted) rc.ScaledBy = filterRefs(ctx, opts.AccessChecker, - toContextRefs(rel.Scalers, ReasonScaleTargetRef, SourceTopology), + toContextRefs(rel.Scalers), "scaledBy", omitted) } @@ -187,8 +186,6 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte Group: rel.ServiceAccount.Group, Namespace: rel.ServiceAccount.Namespace, Name: rel.ServiceAccount.Name, - Reason: ReasonSAName, - Source: SourceK8sSpec, } if checkRef(ctx, opts.AccessChecker, candidate) { rc.Uses.ServiceAccount = candidate @@ -213,11 +210,9 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte } if nodeName != "" { candidate := &ContextRef{ - Kind: "Node", - Group: nodeGroup, - Name: nodeName, - Reason: ReasonNodeName, - Source: SourceK8sSpec, + Kind: "Node", + Group: nodeGroup, + Name: nodeName, } if checkRef(ctx, opts.AccessChecker, candidate) { rc.RunsOn = candidate @@ -364,17 +359,10 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, scanContainers(pod.Spec.InitContainers, pod.Namespace, cmSet, secretSet) scanContainers(pod.Spec.Containers, pod.Namespace, cmSet, secretSet) - // ConfigMaps are most often surfaced via volume mounts (volume.ConfigMap) - // and Secrets via env (SecretKeyRef on container.env). Both kinds appear - // via both paths in practice, so the single Reason label per kind is a - // best-effort discriminator: it answers "what's the dominant lookup - // pattern for THIS kind?" — not "how was THIS particular ref discovered?" - // Reversing them (the prior labelling had them swapped) made the - // metadata actively misleading. uses := &UsesBlock{ - ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", "", ReasonVolumeMount, SourceK8sSpec), "uses.configMaps", omitted), - Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", "", ReasonEnvVarRef, SourceK8sSpec), "uses.secrets", omitted), - PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", "", ReasonClaimRef, SourceK8sSpec), "uses.pvcs", omitted), + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", ""), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", ""), "uses.secrets", omitted), + PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", ""), "uses.pvcs", omitted), } if sa := pod.Spec.ServiceAccountName; sa != "" { @@ -382,8 +370,6 @@ func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, Kind: "ServiceAccount", Namespace: pod.Namespace, Name: sa, - Reason: ReasonSAName, - Source: SourceK8sSpec, } if checkRef(ctx, ac, candidate) { uses.ServiceAccount = candidate @@ -478,7 +464,7 @@ func (s *refSet) add(name, ns string) { // refs returns the accumulated set as ContextRefs sorted by (namespace, name) // for deterministic golden output. -func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) []ContextRef { +func (s *refSet) refs(kind, group string) []ContextRef { if len(s.order) == 0 { return nil } @@ -496,8 +482,6 @@ func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) [] Group: group, Namespace: e.Namespace, Name: e.Name, - Reason: reason, - Source: source, } } return out @@ -507,10 +491,10 @@ func (s *refSet) refs(kind, group string, reason RefReason, source RefSource) [] // Topology ref → ContextRef // --------------------------------------------------------------------------- -// toContextRefs translates a slice of topology.ResourceRef into ContextRefs -// with the given reason+source. Sorted by (kind, namespace, name) for -// determinism — golden tests rely on this ordering. -func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSource) []ContextRef { +// toContextRefs translates a slice of topology.ResourceRef into ContextRefs. +// Sorted by (kind, namespace, name) for determinism — golden tests rely on +// this ordering. +func toContextRefs(refs []topology.ResourceRef) []ContextRef { if len(refs) == 0 { return nil } @@ -521,8 +505,6 @@ func toContextRefs(refs []topology.ResourceRef, reason RefReason, source RefSour Group: r.Group, Namespace: r.Namespace, Name: r.Name, - Reason: reason, - Source: source, }) } sort.SliceStable(out, func(i, j int) bool { diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index d14c474c9..9e14da46a 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -150,9 +150,6 @@ func TestBuild_Pod_FullEnrichment(t *testing.T) { if mb.Kind != "Application" || mb.Name != "storefront" || mb.Namespace != "argocd" { t.Errorf("ManagedBy[0]: got %+v, want Application argocd/storefront", mb) } - if mb.Source != SourceOwnerChain { - t.Errorf("ManagedBy[0].Source: got %q want %q", mb.Source, SourceOwnerChain) - } // Exposes: the Service routes to the pod. if got, want := len(rc.Exposes), 1; got != want { From 3e43759b32ba6bc37d97bd236d5709c46153d445 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 18:13:51 +0300 Subject: [PATCH 13/17] fix(resourcecontext): address Bugbot findings on T6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop the handler-side pre-compute of Relationships in buildAIResourceContext. Build's own fallback already does the identical KindForGVK pseudo-kind resolution; pre-computing here doubled the work whenever the lookup returned nil (handler call returned nil → opts.Relationships=nil → Build's `rel==nil && opts.Topology!=nil` branch re-ran the same scan). - Sort issue-summary rows by (severity desc, Reason asc) before picking topReason. The sibling computeAuditSummaryForResource already sorts (severity desc, CheckID asc) for determinism; the issue path's iteration-order pick could vary across runs when multiple rows tie on severity. --- internal/server/ai_handlers.go | 48 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index cdca066ca..789df5c7f 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -359,23 +359,12 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin opts.Topology = topo opts.Provider = prov opts.DynamicProv = dyn - // Pre-compute Relationships once with the already-fetched obj so - // kind/group disambiguation works (Knative serving.knative.dev/Service - // vs core/v1 Service). idx=nil is fine for single-resource: the - // per-call inline scan is O(E) once. Bulk callers (T12/T89) should - // build a shared index via topology.IndexByResource(topo). - // - // Route through KindForGVK so cross-group CRDs whose Kind collides - // with a core kind (Knative {Service, Configuration, Revision, Route}, - // CAPI Cluster, Istio Gateway) resolve to the right topology node. - // The builder writes those as pseudo-kinds (e.g. "knativeservice/...") - // — without remapping, buildNodeID would resolve "services/..." to - // the core Service node and walk the wrong edges. - gvk := obj.GetObjectKind().GroupVersionKind() - relKind := topology.KindForGVK(gvk.Kind, gvk.Group) - opts.Relationships = topology.GetRelationshipsWithObject( - relKind, namespace, name, obj, topo, prov, dyn, nil, - ) + // Relationships are computed inside Build via GetRelationshipsWithObject, + // which applies the same KindForGVK pseudo-kind remap we used to do + // here. Pre-computing in the handler doubled the work whenever the + // lookup returned nil (no edges): handler call returned nil, Build's + // `rel == nil && opts.Topology != nil` fallback re-ran the identical + // scan. Leaving opts.Relationships unset is the canonical path. } return resourcecontext.Build(r.Context(), obj, opts) @@ -442,9 +431,7 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n } rows, _ := issues.ComposeWithStats(provider, filters) - var count int - var topReason string - var topSeverity issues.Severity + matched := make([]issues.Issue, 0, len(rows)) bySource := make(map[string]int) for _, row := range rows { if row.Name != name { @@ -453,16 +440,25 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if namespace != "" && row.Namespace != namespace { continue } - count++ + matched = append(matched, row) bySource[string(row.Source)]++ - if topSeverity == "" || composeSeverityRank(row.Severity) > composeSeverityRank(topSeverity) { - topSeverity = row.Severity - topReason = row.Reason - } } - if count == 0 { + if len(matched) == 0 { return nil } + // Sort by (severity desc, Reason asc) so TopReason is deterministic + // across runs even when multiple rows tie on severity. Mirrors the + // stable sort applied in computeAuditSummaryForResource. + sort.Slice(matched, func(i, j int) bool { + ri, rj := composeSeverityRank(matched[i].Severity), composeSeverityRank(matched[j].Severity) + if ri != rj { + return ri > rj + } + return matched[i].Reason < matched[j].Reason + }) + count := len(matched) + topSeverity := matched[0].Severity + topReason := matched[0].Reason return &resourcecontext.IssueSummary{ Count: count, HighestSeverity: string(topSeverity), From 37983a1bf48e7ca1f036ea909124400ba337f09c Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 21:45:32 +0300 Subject: [PATCH 14/17] fix(audit): match issue summary's nil-namespace guard for cluster-scoped lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit computeAuditSummaryForResource unconditionally passed []string{namespace} to audit.RunFromCache, even when namespace=="" (cluster-scoped resources). That filters audit to literally namespace="" resources instead of scanning all namespaces. computeIssueSummaryForResource already guards with `if namespace != ""` — match it. Latent today since the audit suite doesn't currently cover cluster-scoped kinds, but the inconsistency would silently miss findings the moment a cluster-scoped check lands. --- internal/server/ai_handlers.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 789df5c7f..4bbca9cde 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -494,7 +494,16 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if cache == nil || kind == "" { return nil } - results := audit.RunFromCache(cache, []string{namespace}, nil) + // Match computeIssueSummaryForResource's guard: passing []string{""} to + // RunFromCache would filter to literally namespace="" resources instead + // of scanning all namespaces. Latent today since the audit suite + // doesn't cover cluster-scoped kinds, but the inconsistency would + // silently miss findings the moment a cluster-scoped check lands. + var namespaces []string + if namespace != "" { + namespaces = []string{namespace} + } + results := audit.RunFromCache(cache, namespaces, nil) if results == nil || len(results.Findings) == 0 { return nil } From 3742ffa1bfd4d1dacbcc3dd55e386d18deb8f994 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 22:45:11 +0300 Subject: [PATCH 15/17] =?UTF-8?q?fix:=20rebase=20fallout=20=E2=80=94=20gro?= =?UTF-8?q?up-aware=20FindingsFor=20+=20smoke-test=20import=20merge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two T6 fixes required after rebasing onto post-#723 (Kyverno) and post-#726 (RBAC reverse-lookup + Crossplane) main: - T11 (#723) threaded API group through the PolicyReport lookup, changing policyreports.Index.FindingsFor from (kind, ns, name) to (group, kind, ns, name). Updated resourcecontext.PolicyReportLookup interface, the build.go call site (passing ident.Group), the policyReportLookupAdapter in ai_handlers.go, and the test mock. Group threading is a strict improvement — two CRDs sharing kind+ns+name across API groups now get disjoint findings instead of one inheriting the other's. - internal/server/server_smoke_test.go acquired a `rbacv1` import on main (#726) at the same line our `networkingv1` import lands on this branch. Conflict resolution: keep both, sorted. --- internal/server/ai_handlers.go | 4 ++-- internal/server/server_smoke_test.go | 5 +---- pkg/resourcecontext/build.go | 4 ++-- pkg/resourcecontext/build_test.go | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 4bbca9cde..cd4a0f454 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -59,11 +59,11 @@ type policyReportLookupAdapter struct { idx *policyreports.Index } -func (a policyReportLookupAdapter) FindingsFor(kind, namespace, name string) []resourcecontext.KyvernoFinding { +func (a policyReportLookupAdapter) FindingsFor(group, kind, namespace, name string) []resourcecontext.KyvernoFinding { if a.idx == nil { return nil } - findings := a.idx.FindingsFor(kind, namespace, name) + findings := a.idx.FindingsFor(group, kind, namespace, name) if len(findings) == 0 { return nil } diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 21fb3e216..268283396 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -11,11 +11,8 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" -<<<<<<< HEAD - rbacv1 "k8s.io/api/rbac/v1" -======= networkingv1 "k8s.io/api/networking/v1" ->>>>>>> b01d112 (fix(resourcecontext): canonical kind + cross-group pseudo-kind for relationship lookup) + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index 46d2576a9..49170abf9 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -66,7 +66,7 @@ type Options struct { // Build does not import pkg/policyreports directly because callers may // adapt other policy engines into the same shape. type PolicyReportLookup interface { - FindingsFor(kind, namespace, name string) []KyvernoFinding + FindingsFor(group, kind, namespace, name string) []KyvernoFinding } // RefAccessChecker abstracts the RBAC check so this package doesn't import @@ -230,7 +230,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte // counts only (fail/warn/pass); diagnostic tier adds the top[] // findings. Tier discrimination keeps the basic-tier wire size tight. if opts.PolicyReports != nil { - findings := opts.PolicyReports.FindingsFor(ident.Kind, ident.Namespace, ident.Name) + findings := opts.PolicyReports.FindingsFor(ident.Group, ident.Kind, ident.Namespace, ident.Name) if len(findings) > 0 { rc.PolicySummary = buildPolicySummary(findings, opts.Tier) } diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 9e14da46a..c1fe2a5c0 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -41,7 +41,7 @@ func (d denyChecker) CanRead(_ context.Context, group, kind, namespace string) b // mockPolicyReports implements PolicyReportLookup. type mockPolicyReports map[string][]KyvernoFinding -func (m mockPolicyReports) FindingsFor(kind, namespace, name string) []KyvernoFinding { +func (m mockPolicyReports) FindingsFor(group, kind, namespace, name string) []KyvernoFinding { return m[kind+"/"+namespace+"/"+name] } From 9d6658a125718d904314574bf7dd285b11fe1e6f Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Mon, 18 May 2026 23:55:29 +0300 Subject: [PATCH 16/17] fix: group-aware audit + issue summary lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-group kind collisions (e.g. core/Service vs serving.knative.dev/Service sharing namespace + name) previously caused audit/issue summaries to silently inherit findings from the wrong resource. The fetch and relationship paths in T6 were already group-aware; the summary lookups were not. pkg/audit: - Finding and ResourceGroup gain a Group field (omitempty on the wire). - ResourceKey signature changes to (group, kind, ns, name) — encoded "group|Kind|ns|name" with "|" delimiter matching the issue-source key in internal/summarycontext. - Exported GroupForBuiltinKind(kind) — single source of truth for the built-in (Kind→Group) map. buildResults populates Finding.Group via this helper so the 20 per-check emission sites stay terse. internal/server: - buildAIResourceContext threads canonicalGroup from obj GVK through to both computeIssueSummaryForResource and computeAuditSummaryForResource. - Audit lookup uses the new group-aware ResourceKey. - Issue summary adds a strict row.Group == group check inside the match loop. The previous kind-only filter could silently pull in a colliding core kind's issues for a CRD lookup. - UI audit drill-down (handleResourceAudit) explicitly passes group="" since the URL doesn't carry group today — comment points to #35 for the CRD-aware drill-down work. internal/issues: - Centralised Group resolution via resolveGroup(group, kind): use the explicit group if set, else fall back to audit.GroupForBuiltinKind. - Applied at fromProblem (legacy k8s.DetectProblems sites still emit Group="" for built-in workloads), fromAudit (passes through audit.Finding.Group which buildResults populates), fromWarningEvent (split apiVersion → group, with "v1"-shape → core). Pin tests: - TestResourceKey_GroupAware: distinct keys across groups. - TestIndexByResource_NoCrossGroupCollision: lookup per-group returns only its own findings. - TestGroupForBuiltinKind: the table. Closes #35. --- internal/issues/issues.go | 27 +++++++++++- internal/server/ai_handlers.go | 23 +++++++--- internal/server/audit_handlers.go | 10 +++-- pkg/audit/checks.go | 11 ++++- pkg/audit/helpers.go | 56 ++++++++++++++++++++---- pkg/audit/helpers_test.go | 71 +++++++++++++++++++++++++++++++ pkg/audit/types.go | 10 +++++ 7 files changed, 188 insertions(+), 20 deletions(-) create mode 100644 pkg/audit/helpers_test.go diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 2ed01b2bb..4a3764dae 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -303,6 +303,21 @@ func condTypeReason(condType, reason string) string { // Source-specific normalization // --------------------------------------------------------------------------- +// resolveGroup returns the explicit group if set, else falls back to the +// built-in (Kind→Group) table. Some legacy Problem emission sites in +// k8s.DetectProblems still leave Group="" for built-in workloads +// (Deployment, StatefulSet, etc.) — without this fallback, the +// group-aware consumer (computeIssueSummaryForResource) would silently +// drop those rows when looking up by canonical group like "apps". +// Centralised here so the (Kind→Group) map lives in one place across +// packages (pkg/audit owns the table; this is a pass-through). +func resolveGroup(group, kind string) string { + if group != "" { + return group + } + return bp.GroupForBuiltinKind(kind) +} + func fromProblem(p k8s.Problem, now time.Time) Issue { sev := SeverityWarning if p.Severity == "critical" { @@ -313,7 +328,7 @@ func fromProblem(p k8s.Problem, now time.Time) Issue { Severity: sev, Source: SourceProblem, Kind: p.Kind, - Group: p.Group, + Group: resolveGroup(p.Group, p.Kind), Namespace: p.Namespace, Name: p.Name, Reason: p.Reason, @@ -333,6 +348,7 @@ func fromAudit(fin bp.Finding, now time.Time) Issue { Severity: sev, Source: SourceAudit, Kind: fin.Kind, + Group: resolveGroup(fin.Group, fin.Kind), Namespace: fin.Namespace, Name: fin.Name, Reason: fin.CheckID, @@ -413,10 +429,19 @@ func fromWarningEvent(e *corev1.Event) Issue { if first.IsZero() { first = last } + // Event.InvolvedObject carries apiVersion (group/version); split out + // the group so cross-group consumers don't collide when a Knative + // Service and a core Service share name+ns. + group, _, _ := strings.Cut(e.InvolvedObject.APIVersion, "/") + if e.InvolvedObject.APIVersion != "" && !strings.Contains(e.InvolvedObject.APIVersion, "/") { + // "v1" → core group "". + group = "" + } return Issue{ Severity: SeverityWarning, Source: SourceEvent, Kind: e.InvolvedObject.Kind, + Group: resolveGroup(group, e.InvolvedObject.Kind), Namespace: e.Namespace, Name: e.InvolvedObject.Name, Reason: e.Reason, diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index cd4a0f454..2c3d74fa0 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -333,13 +333,15 @@ func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kin // so the audit index lookup keys correctly. Falls back to the URL kind // only when TypeMeta is somehow empty; non-canonical input there would // silently mis-key the audit lookup. - canonicalKind := obj.GetObjectKind().GroupVersionKind().Kind + gvk := obj.GetObjectKind().GroupVersionKind() + canonicalKind := gvk.Kind if canonicalKind == "" { canonicalKind = kind } + canonicalGroup := gvk.Group - issueSum := computeIssueSummaryForResource(cache, canonicalKind, namespace, name) - auditSum := computeAuditSummaryForResource(cache, canonicalKind, namespace, name) + issueSum := computeIssueSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) + auditSum := computeAuditSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) opts := resourcecontext.Options{ Tier: resourcecontext.TierBasic, @@ -414,7 +416,7 @@ func (s *Server) topologyForContext(namespace string) (*topology.Topology, topol // summary silently collapses to nil. // // Returns nil when no issues match — Build then omits the IssueSummary field. -func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.IssueSummary { +func computeIssueSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.IssueSummary { if cache == nil { return nil } @@ -440,6 +442,15 @@ func computeIssueSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n if namespace != "" && row.Namespace != namespace { continue } + // Group-aware match: T11 populates Issue.Group for problem + + // condition sources, so a Knative serving.knative.dev/Service + // lookup won't pull in the core Service's issues (or vice + // versa). The fromAudit / fromEvent sources emit Group="" today + // — those correctly match only the core-group lookup, which is + // the existing behavior. + if row.Group != group { + continue + } matched = append(matched, row) bySource[string(row.Source)]++ } @@ -490,7 +501,7 @@ func composeSeverityRank(s issues.Severity) int { // CheckID as the ascending tiebreaker. Map iteration ordering does NOT // influence the choice — agents pinning regression tests on // resourceContext output rely on stable field values across runs. -func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, name string) *resourcecontext.AuditSummary { +func computeAuditSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.AuditSummary { if cache == nil || kind == "" { return nil } @@ -508,7 +519,7 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, kind, namespace, n return nil } idx := bpaudit.IndexByResource(results.Findings) - match := idx[bpaudit.ResourceKey(kind, namespace, name)] + match := idx[bpaudit.ResourceKey(group, kind, namespace, name)] if len(match) == 0 { return nil } diff --git a/internal/server/audit_handlers.go b/internal/server/audit_handlers.go index edf22d2d8..5465e638d 100644 --- a/internal/server/audit_handlers.go +++ b/internal/server/audit_handlers.go @@ -130,12 +130,16 @@ func (s *Server) handleAuditResource(w http.ResponseWriter, r *http.Request) { results = applyAuditSettings(results, getAuditConfig()) index := bp.IndexByResource(results.Findings) - // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment") - findings := index[bp.ResourceKey(kind, namespace, name)] + // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment"). + // This handler is the UI's per-resource audit drill-down — group isn't on + // the URL today (the UI doesn't list grouped CRDs here yet), so we look + // up with group="" which matches the built-ins the audit suite scans. + // When CRD audit lands (#35 follow-up), thread group through the URL. + findings := index[bp.ResourceKey("", kind, namespace, name)] if findings == nil { goKind := apiResourceToKind(kind) if goKind != kind { - findings = index[bp.ResourceKey(goKind, namespace, name)] + findings = index[bp.ResourceKey("", goKind, namespace, name)] } } if findings == nil { diff --git a/pkg/audit/checks.go b/pkg/audit/checks.go index 1e19dacb9..c61120190 100644 --- a/pkg/audit/checks.go +++ b/pkg/audit/checks.go @@ -935,6 +935,15 @@ func buildResults(findings []Finding) *ScanResults { categories[cat] = CategorySummary{} } + // Populate Group from the built-in (Kind→Group) table. Check emission + // sites leave Group="" so the per-check code stays terse — single + // point of truth here instead of every Finding{} literal. + for i := range findings { + if findings[i].Group == "" { + findings[i].Group = GroupForBuiltinKind(findings[i].Kind) + } + } + // Merge findings: same (resource, checkID) get combined into one finding // with messages joined, so multi-container workloads show all affected containers. type checkKey struct{ resource, checkID string } @@ -942,7 +951,7 @@ func buildResults(findings []Finding) *ScanResults { var dedupFindings []Finding for _, f := range findings { - key := checkKey{ResourceKey(f.Kind, f.Namespace, f.Name), f.CheckID} + key := checkKey{ResourceKey(f.Group, f.Kind, f.Namespace, f.Name), f.CheckID} if idx, exists := mergeIndex[key]; exists { dedupFindings[idx].Message += "; " + f.Message continue diff --git a/pkg/audit/helpers.go b/pkg/audit/helpers.go index aee6ebea1..681a5eaa2 100644 --- a/pkg/audit/helpers.go +++ b/pkg/audit/helpers.go @@ -6,19 +6,23 @@ import ( "strings" ) -// ResourceKey returns the index key for a resource: "Kind/namespace/name". -func ResourceKey(kind, namespace, name string) string { - if namespace == "" { - return fmt.Sprintf("%s//%s", kind, name) - } - return fmt.Sprintf("%s/%s/%s", kind, namespace, name) +// ResourceKey returns the index key for a resource: +// "group|Kind|namespace|name". Group goes first because both group and +// namespace can legitimately be empty independently — encoding group +// last would leave a cluster-scoped CRD key ambiguous with a +// namespaced core-group key under any 3-part parse. "|" is a safe +// delimiter — Kubernetes API groups follow DNS subdomain rules and +// can't contain it. Mirrors the same shape as the issue-source key in +// internal/summarycontext. +func ResourceKey(group, kind, namespace, name string) string { + return fmt.Sprintf("%s|%s|%s|%s", group, kind, namespace, name) } // IndexByResource builds a lookup map from ResourceKey → []Finding. func IndexByResource(findings []Finding) map[string][]Finding { m := make(map[string][]Finding) for _, f := range findings { - key := ResourceKey(f.Kind, f.Namespace, f.Name) + key := ResourceKey(f.Group, f.Kind, f.Namespace, f.Name) m[key] = append(m[key], f) } return m @@ -33,6 +37,7 @@ func GroupByResource(findings []Finding) []ResourceGroup { for _, fs := range index { g := ResourceGroup{ Kind: fs[0].Kind, + Group: fs[0].Group, Namespace: fs[0].Namespace, Name: fs[0].Name, Findings: fs, @@ -55,13 +60,46 @@ func GroupByResource(findings []Finding) []ResourceGroup { if groups[i].Warning != groups[j].Warning { return groups[i].Warning > groups[j].Warning } - return ResourceKey(groups[i].Kind, groups[i].Namespace, groups[i].Name) < - ResourceKey(groups[j].Kind, groups[j].Namespace, groups[j].Name) + return ResourceKey(groups[i].Group, groups[i].Kind, groups[i].Namespace, groups[i].Name) < + ResourceKey(groups[j].Group, groups[j].Kind, groups[j].Namespace, groups[j].Name) }) return groups } +// GroupForBuiltinKind maps a built-in Kubernetes Kind to the API group +// the audit suite scans it under. Returns "" for kinds the suite +// doesn't recognize — those don't get a populated Finding.Group, which +// means cross-group collision risk is bounded to the listed built-ins +// vs. third-party CRDs sharing the same Kind name. +// +// Kept here (next to ResourceKey) so the Kind→Group mapping lives in +// one place rather than every Finding{} emission site. buildResults +// populates Finding.Group via this helper before the index is built; +// per-check code stays terse and group-agnostic. +// +// Also reused by internal/issues to resolve Group on Problem-sourced +// rows that pre-date group-aware emission — keeps the (Kind→Group) +// table in one place across packages. +func GroupForBuiltinKind(kind string) string { + switch kind { + case "Pod", "Service", "ConfigMap", "Secret", "Node", "Namespace", + "PersistentVolume", "PersistentVolumeClaim", "ServiceAccount": + return "" + case "Deployment", "DaemonSet", "StatefulSet", "ReplicaSet": + return "apps" + case "Job", "CronJob": + return "batch" + case "HorizontalPodAutoscaler": + return "autoscaling" + case "Ingress", "NetworkPolicy": + return "networking.k8s.io" + case "PodDisruptionBudget": + return "policy" + } + return "" +} + // ApplySettings filters audit results based on ignored namespaces (with wildcard // patterns like *-system) and disabled checks. This is the shared implementation // used by all consumers (HTTP handlers, MCP, skyhook-connector). diff --git a/pkg/audit/helpers_test.go b/pkg/audit/helpers_test.go new file mode 100644 index 000000000..7ee965f71 --- /dev/null +++ b/pkg/audit/helpers_test.go @@ -0,0 +1,71 @@ +package audit + +import "testing" + +// TestResourceKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups produce distinct +// keys. Pre-fix, ResourceKey was group-blind: a Knative +// serving.knative.dev/Service "api" in "prod" collided with the core +// "" /Service "api" in "prod", and IndexByResource would conflate +// their findings (whichever Finding came last would shadow the other +// in the dedup checkKey, and any lookup by ResourceKey returned the +// pooled set). The fix routes Group through the key. +func TestResourceKey_GroupAware(t *testing.T) { + core := ResourceKey("", "Service", "prod", "api") + knative := ResourceKey("serving.knative.dev", "Service", "prod", "api") + if core == knative { + t.Fatalf("ResourceKey collides across groups: %q == %q", core, knative) + } +} + +// TestIndexByResource_NoCrossGroupCollision exercises the same fix +// end-to-end: emit two Findings for kind/ns/name "Service/prod/api", +// one with Group="" (core) and one with Group="serving.knative.dev" +// (Knative), and verify each lookup returns ONLY its own finding — +// not the union. +func TestIndexByResource_NoCrossGroupCollision(t *testing.T) { + findings := []Finding{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", CheckID: "core-finding"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", CheckID: "knative-finding"}, + } + idx := IndexByResource(findings) + + core := idx[ResourceKey("", "Service", "prod", "api")] + if len(core) != 1 || core[0].CheckID != "core-finding" { + t.Errorf("core lookup: got %+v, want 1 finding with CheckID=core-finding", core) + } + knative := idx[ResourceKey("serving.knative.dev", "Service", "prod", "api")] + if len(knative) != 1 || knative[0].CheckID != "knative-finding" { + t.Errorf("knative lookup: got %+v, want 1 finding with CheckID=knative-finding", knative) + } +} + +// TestGroupForBuiltinKind pins the (Kind→Group) table used by +// buildResults to populate Finding.Group for emission sites that leave +// it empty. Centralising the table here keeps per-check code terse; +// drift between this table and the actual API group a check scans +// would silently mis-key findings. +func TestGroupForBuiltinKind(t *testing.T) { + cases := map[string]string{ + "Pod": "", + "Service": "", + "ConfigMap": "", + "Secret": "", + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "ReplicaSet": "apps", + "Job": "batch", + "CronJob": "batch", + "HorizontalPodAutoscaler": "autoscaling", + "Ingress": "networking.k8s.io", + "NetworkPolicy": "networking.k8s.io", + "PodDisruptionBudget": "policy", + "UnknownCRD": "", + } + for kind, want := range cases { + if got := GroupForBuiltinKind(kind); got != want { + t.Errorf("GroupForBuiltinKind(%q) = %q, want %q", kind, got, want) + } + } +} diff --git a/pkg/audit/types.go b/pkg/audit/types.go index a5b8f91f4..4d7d63da3 100644 --- a/pkg/audit/types.go +++ b/pkg/audit/types.go @@ -63,8 +63,12 @@ type ScanResults struct { // ResourceGroup aggregates findings for a single resource. // Groups are sorted by severity (danger first), then by name. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. type ResourceGroup struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` Warning int `json:"warning"` @@ -88,8 +92,14 @@ type CategorySummary struct { } // Finding represents a single best-practice violation. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. Check emission sites leave Group="" — buildResults +// populates it via groupForBuiltinKind so the (Kind→Group) map lives +// in one place rather than every check function. type Finding struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` CheckID string `json:"checkID"` From 7a1f006de8a5ce791f7fb146af0e45e6520a95e8 Mon Sep 17 00:00:00 2001 From: Nadav Erell Date: Tue, 19 May 2026 01:23:18 +0300 Subject: [PATCH 17/17] fix(resourcecontext): normalize audit severity to issue vocabulary on the wire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit auditSummary.highestSeverity used to emit raw audit vocabulary ("danger" / "warning") while sibling issueSummary.highestSeverity emits issue vocabulary ("critical" / "warning"). Two fields under the same resourceContext disagreeing on what "highest severity" means is a real wire-shape footgun — consumers parsing one will mis-handle the other. Mirrors the same mapping internal/issues.fromAudit applies when audit findings flow through the unified issue stream, so the two paths now agree. Empty / future audit severities pass through unchanged so the contract stays explicit if pkg/audit grows new values. Pinned by TestNormalizeAuditSeverity. --- internal/server/ai_handlers.go | 22 ++++++++++++++-- internal/server/ai_handlers_severity_test.go | 27 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 internal/server/ai_handlers_severity_test.go diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 2c3d74fa0..10593ccfd 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -533,15 +533,33 @@ func computeAuditSummaryForResource(cache *k8s.ResourceCache, group, kind, names } return match[i].CheckID < match[j].CheckID }) - topSeverity := match[0].Severity topFinding := match[0].CheckID return &resourcecontext.AuditSummary{ Count: len(match), - HighestSeverity: topSeverity, + HighestSeverity: normalizeAuditSeverity(match[0].Severity), TopFinding: topFinding, } } +// normalizeAuditSeverity maps the audit suite's emission vocabulary +// ("danger" / "warning") onto the unified resourceContext severity +// scale ("critical" / "warning") used by issueSummary. Two sibling +// fields in the same response reporting severity in different +// vocabularies — "danger" vs "critical" — is a wire-shape footgun for +// consumers. Mirrors the same mapping internal/issues.fromAudit +// applies when audit findings flow through the unified issue stream. +// Empty / unknown severities pass through unchanged so the contract +// stays explicit if the audit suite ever grows new values. +func normalizeAuditSeverity(s string) string { + switch s { + case bpaudit.SeverityDanger: + return string(issues.SeverityCritical) + case bpaudit.SeverityWarning: + return string(issues.SeverityWarning) + } + return s +} + // auditSeverityRank orders audit finding severities ("danger" > "warning"). func auditSeverityRank(s string) int { switch s { diff --git a/internal/server/ai_handlers_severity_test.go b/internal/server/ai_handlers_severity_test.go new file mode 100644 index 000000000..abeadcef8 --- /dev/null +++ b/internal/server/ai_handlers_severity_test.go @@ -0,0 +1,27 @@ +package server + +import ( + "testing" + + "github.com/skyhook-io/radar/internal/issues" + bpaudit "github.com/skyhook-io/radar/pkg/audit" +) + +// Pin the audit→issue severity normalization on the AuditSummary wire. +// Without it, sibling resourceContext fields disagree on what "highest +// severity" means: audit emits "danger" while issueSummary emits +// "critical". Mirror the same mapping internal/issues.fromAudit uses +// for the unified issue stream so consumers see one vocabulary. +func TestNormalizeAuditSeverity(t *testing.T) { + cases := map[string]string{ + bpaudit.SeverityDanger: string(issues.SeverityCritical), + bpaudit.SeverityWarning: string(issues.SeverityWarning), + "": "", // empty stays empty — explicit contract + "unknown": "unknown", // future audit values pass through + } + for in, want := range cases { + if got := normalizeAuditSeverity(in); got != want { + t.Errorf("normalizeAuditSeverity(%q) = %q, want %q", in, got, want) + } + } +}