diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 2ed01b2bb..5bb4f6279 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -74,10 +74,17 @@ func Compose(p Provider, f Filters) []Issue { // severity desc, then last-seen desc, then kind/ns/name for stable // tiebreaks. func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { + // Negative Limit is the "uncapped" sentinel: callers that need the + // full matched set (per-resource issue indexes for /api/ai list + + // search summaryContext) pass NoLimit so a 5000-issue cluster + // doesn't silently drop counts for resources whose issues fall in + // the tail beyond MaxLimit. Zero still maps to DefaultLimit so the + // public /api/issues + MCP issues_list keep their tight caps. + uncapped := f.Limit < 0 if f.Limit == 0 { f.Limit = DefaultLimit } - if f.Limit > MaxLimit { + if !uncapped && f.Limit > MaxLimit { f.Limit = MaxLimit } @@ -201,7 +208,7 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { return out[i].Name < out[j].Name }) stats.TotalMatched = len(out) - if len(out) > f.Limit { + if !uncapped && len(out) > f.Limit { out = out[:f.Limit] } return out, stats @@ -211,11 +218,24 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { // warning Issue for each object that has a False Ready/Available/etc. // condition. Skips kinds owned by curated checkers (Cluster API today) // to avoid double-reporting. +// +// When f.Kinds is non-empty (e.g. summaryContext building a per-resource +// issue index for a list_resources call on a single kind), GVRs whose +// kind isn't in the filter are skipped BEFORE the ListDynamic call — +// without this gate, a pods-only request still scanned every watched +// CRD up front and applyFilters discarded the rows afterward. 
Kind +// comparison mirrors applyFilters: lowercase for case-insensitive +// match against the user's filter (which itself is canonicalized to +// the singular form upstream). func detectGenericCRDIssues(p Provider, f Filters) []Issue { gvrs := p.WatchedDynamic() if len(gvrs) == 0 { return nil } + wantKind := map[string]bool{} + for _, k := range f.Kinds { + wantKind[strings.ToLower(k)] = true + } var out []Issue for _, gvr := range gvrs { if isCuratedCRDGroup(gvr.Group) { @@ -225,6 +245,15 @@ func detectGenericCRDIssues(p Provider, f Filters) []Issue { if kind == "" { continue } + // applyFilters runs after Compose returns — but on hot paths that + // pin a single kind (summaryContext per-row index), routing the + // kind filter through here skips the per-GVR ListDynamic call + // entirely. Match in lowercase (same as applyFilters) so + // "Pod"/"pod" and CRD-typed "MyResource"/"myresource" both + // compare equal. + if len(wantKind) > 0 && !wantKind[strings.ToLower(kind)] { + continue + } clusterScoped, _, _ := classifyDynamicScope(p, gvr, kind) if clusterScoped && f.CanReadClusterScoped != nil && !f.CanReadClusterScoped(kind, gvr.Group) { continue @@ -303,6 +332,21 @@ func condTypeReason(condType, reason string) string { // Source-specific normalization // --------------------------------------------------------------------------- +// resolveGroup returns the explicit group if set, else falls back to the +// built-in (Kind→Group) table. Some legacy Problem emission sites in +// k8s.DetectProblems still leave Group="" for built-in workloads +// (Deployment, StatefulSet, etc.) — without this fallback, the +// group-aware consumer (computeIssueSummaryForResource) would silently +// drop those rows when looking up by canonical group like "apps". +// Centralised here so the (Kind→Group) map lives in one place across +// packages (pkg/audit owns the table; this is a pass-through). 
+func resolveGroup(group, kind string) string { + if group != "" { + return group + } + return bp.GroupForBuiltinKind(kind) +} + func fromProblem(p k8s.Problem, now time.Time) Issue { sev := SeverityWarning if p.Severity == "critical" { @@ -313,7 +357,7 @@ func fromProblem(p k8s.Problem, now time.Time) Issue { Severity: sev, Source: SourceProblem, Kind: p.Kind, - Group: p.Group, + Group: resolveGroup(p.Group, p.Kind), Namespace: p.Namespace, Name: p.Name, Reason: p.Reason, @@ -333,6 +377,7 @@ func fromAudit(fin bp.Finding, now time.Time) Issue { Severity: sev, Source: SourceAudit, Kind: fin.Kind, + Group: resolveGroup(fin.Group, fin.Kind), Namespace: fin.Namespace, Name: fin.Name, Reason: fin.CheckID, @@ -413,10 +458,19 @@ func fromWarningEvent(e *corev1.Event) Issue { if first.IsZero() { first = last } + // Event.InvolvedObject carries apiVersion (group/version); split out + // the group so cross-group consumers don't collide when a Knative + // Service and a core Service share name+ns. + group, _, _ := strings.Cut(e.InvolvedObject.APIVersion, "/") + if e.InvolvedObject.APIVersion != "" && !strings.Contains(e.InvolvedObject.APIVersion, "/") { + // "v1" → core group "". + group = "" + } return Issue{ Severity: SeverityWarning, Source: SourceEvent, Kind: e.InvolvedObject.Kind, + Group: resolveGroup(group, e.InvolvedObject.Kind), Namespace: e.Namespace, Name: e.InvolvedObject.Name, Reason: e.Reason, diff --git a/internal/issues/issues_test.go b/internal/issues/issues_test.go index 2e774746a..beb995725 100644 --- a/internal/issues/issues_test.go +++ b/internal/issues/issues_test.go @@ -622,3 +622,87 @@ func TestFlattenNamespacedProblems_EmptyInputReturnsNil(t *testing.T) { t.Errorf("empty input should produce empty output, got %+v", out) } } + +// countingProvider wraps fakeProvider and tallies ListDynamic calls per +// GVR. 
Used by TestDetectGenericCRDIssues_SkipsListWhenKindFiltered to +// pin that detectGenericCRDIssues short-circuits the per-GVR +// ListDynamic call when f.Kinds excludes the GVR's kind — on clusters +// with hundreds of watched CRDs, scanning every one for a pods-only +// summaryContext request was the dominant cost. +type countingProvider struct { + fakeProvider + listCalls map[schema.GroupVersionResource]int +} + +func (c *countingProvider) ListDynamic(gvr schema.GroupVersionResource, ns string) ([]*unstructured.Unstructured, error) { + if c.listCalls == nil { + c.listCalls = map[schema.GroupVersionResource]int{} + } + c.listCalls[gvr]++ + return c.fakeProvider.ListDynamic(gvr, ns) +} + +// TestDetectGenericCRDIssues_SkipsListWhenKindFiltered pins the +// "scan all CRDs before kindFilter applies" perf fix in +// detectGenericCRDIssues. Pre-fix, a Compose call with Kinds=["Pod"] +// still iterated every watched CRD GVR and ran ListDynamic on each; +// applyFilters then discarded the non-matching rows at the end. +// +// On a cluster with hundreds of watched CRDs this dominated the +// summaryContext per-row index build for list_resources kind=pods. +// The fix routes f.Kinds awareness into detectGenericCRDIssues so +// non-matching GVRs skip the ListDynamic call entirely. +func TestDetectGenericCRDIssues_SkipsListWhenKindFiltered(t *testing.T) { + podGVR := schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} + appGVR := schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "applications"} + npGVR := schema.GroupVersionResource{Group: "karpenter.sh", Version: "v1", Resource: "nodepools"} + + p := &countingProvider{ + fakeProvider: fakeProvider{ + dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{ + podGVR: {}, // empty — only counts the call. 
+ appGVR: {{Object: map[string]any{ + "metadata": map[string]any{"name": "a", "namespace": "argocd"}, + "status": map[string]any{ + "conditions": []any{ + map[string]any{"type": "Synced", "status": "False", "reason": "Drift"}, + }, + }, + }}}, + npGVR: {}, // empty — only counts the call. + }, + kinds: map[schema.GroupVersionResource]string{ + podGVR: "Pod", + appGVR: "Application", + npGVR: "NodePool", + }, + }, + } + + // kindFilter restricts to Application — the other two GVRs must NOT + // be listed. detectGenericCRDIssues lowercases the kind comparison + // (mirrors applyFilters), so the canonical "Application" matches the + // emitted Kind for the argoproj.io GVR. + _ = detectGenericCRDIssues(p, Filters{Kinds: []string{"Application"}}) + + if got := p.listCalls[podGVR]; got != 0 { + t.Errorf("Pod GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[npGVR]; got != 0 { + t.Errorf("NodePool GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[appGVR]; got == 0 { + t.Errorf("Application GVR ListDynamic calls = %d, want >= 1 (matching kind must still be scanned)", got) + } + + // Sanity: empty Kinds filter scans every GVR (no per-kind shortcut + // when caller didn't ask for one). Pins that the fix is filter-aware + // rather than always-skip. + p.listCalls = nil + _ = detectGenericCRDIssues(p, Filters{}) + for gvr, want := range map[schema.GroupVersionResource]bool{podGVR: true, appGVR: true, npGVR: true} { + if got := p.listCalls[gvr] > 0; got != want { + t.Errorf("no kind filter: GVR %s called=%v, want %v", gvr.Resource, got, want) + } + } +} diff --git a/internal/issues/types.go b/internal/issues/types.go index 11b368433..63b22a38c 100644 --- a/internal/issues/types.go +++ b/internal/issues/types.go @@ -123,4 +123,11 @@ type Filters struct { const ( DefaultLimit = 200 MaxLimit = 1000 + // NoLimit disables the result cap. 
Pass as Filters.Limit when the + // caller needs the full matched set (e.g. building a per-resource + // issue index for summaryContext — capping there would silently zero + // out counts for resources whose issues fall in the tail beyond + // MaxLimit on large clusters). Stats.TotalMatched is reliable + // regardless; this just turns off the post-sort slice. + NoLimit = -1 ) diff --git a/internal/k8s/problems.go b/internal/k8s/problems.go index f5e19d452..766f98f34 100644 --- a/internal/k8s/problems.go +++ b/internal/k8s/problems.go @@ -59,6 +59,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d available", d.Status.AvailableReplicas, d.Status.Replicas), Age: FormatAge(ageDur), @@ -78,6 +79,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: "Rollout stuck", Message: cond.Message, @@ -107,6 +109,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "StatefulSet", Namespace: ss.Namespace, Name: ss.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d ready", ss.Status.ReadyReplicas, ss.Status.Replicas), Age: FormatAge(ageDur), @@ -133,6 +136,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "DaemonSet", Namespace: ds.Namespace, Name: ds.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d unavailable", ds.Status.NumberUnavailable), Age: FormatAge(ageDur), @@ -157,6 +161,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "HorizontalPodAutoscaler", Namespace: hp.Namespace, Name: hp.Name, + Group: "autoscaling", Severity: "medium", Reason: hp.Problem, Message: hp.Reason, @@ -177,6 +182,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: 
"CronJob", Namespace: cp.Namespace, Name: cp.Name, + Group: "batch", Severity: "medium", Reason: cp.Problem, Message: cp.Reason, @@ -251,6 +257,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Job", Namespace: job.Namespace, Name: job.Name, + Group: "batch", Severity: "high", Reason: fmt.Sprintf("Running for %s with no completions", FormatAge(ageDur)), Age: FormatAge(ageDur), diff --git a/internal/k8s/problems_test.go b/internal/k8s/problems_test.go new file mode 100644 index 000000000..779b74f4f --- /dev/null +++ b/internal/k8s/problems_test.go @@ -0,0 +1,144 @@ +package k8s + +import ( + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// TestDetectProblems_PopulatesGroup pins that every built-in Problem +// emitted by DetectProblems carries the correct canonical API group. +// +// The summary_context issue index keys per-resource counts as +// "group|kind|ns|name" — a Problem with an empty Group collides with +// no real bucket, silently zeroing issueCount for that workload row. +// Pre-fix, all the built-in append-Problem sites omitted the field, so +// every broken Deployment/StatefulSet/DaemonSet/HPA/CronJob/Job +// reported issueCount: 0 in the AI list envelope — a regression +// against the pre-group-aware behavior. +// +// Construct one broken object per built-in kind, drive DetectProblems +// against a fake client, and assert each emitted Problem's Group +// matches the canonical group for its kind. +func TestDetectProblems_PopulatesGroup(t *testing.T) { + defer ResetTestState() + + oneReplica := int32(1) + minReplicas := int32(1) + now := time.Now() + // Job needs to be older than 1h to surface a "stuck" problem. 
+ jobStart := metav1.NewTime(now.Add(-2 * time.Hour)) + + client := fake.NewClientset( + // Deployment with unavailable replicas — triggers the + // "X/Y available" Problem branch. + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: appsv1.DeploymentSpec{Replicas: &oneReplica}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UnavailableReplicas: 1, + }, + }, + // StatefulSet with readyReplicas < replicas. + &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "prod"}, + Spec: appsv1.StatefulSetSpec{Replicas: &oneReplica}, + Status: appsv1.StatefulSetStatus{ + Replicas: 1, + ReadyReplicas: 0, + }, + }, + // DaemonSet with numberUnavailable > 0. + &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Name: "logger", Namespace: "prod"}, + Status: appsv1.DaemonSetStatus{ + NumberUnavailable: 2, + }, + }, + // HPA at its replica ceiling — DetectHPAProblems flags + // "maxed" when current and desired both hit MaxReplicas. + // The wrapper sets Group="autoscaling". + &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MinReplicas: &minReplicas, + MaxReplicas: 10, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + }, + }, + // Job stuck Active>0 for >1h with no completions. + &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod", CreationTimestamp: jobStart}, + Status: batchv1.JobStatus{ + Active: 1, + Succeeded: 0, + Failed: 0, + }, + }, + ) + + if err := InitTestResourceCache(client); err != nil { + t.Fatalf("InitTestResourceCache: %v", err) + } + cache := GetResourceCache() + if cache == nil { + t.Fatal("cache nil after init") + } + + // Allow informers a brief moment to populate. The fake clientset + // pre-seeds the store, but the lister types reconstruct via + // informer events on a separate goroutine. 
+ deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if hasAllProblemTypes(DetectProblems(cache, "prod")) { + break + } + time.Sleep(20 * time.Millisecond) + } + + problems := DetectProblems(cache, "prod") + + wantGroup := map[string]string{ + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "HorizontalPodAutoscaler": "autoscaling", + "Job": "batch", + } + + got := make(map[string]string, len(problems)) + for _, p := range problems { + // One Problem per kind is enough for the Group assertion; + // duplicates (e.g. Deployment Available + ProgressDeadline) + // must agree on Group so the last-write-wins shape is fine. + got[p.Kind] = p.Group + } + + for kind, want := range wantGroup { + gotGroup, ok := got[kind] + if !ok { + t.Errorf("no Problem emitted for %s — fixture wiring broken; got %d problems: %+v", kind, len(problems), problems) + continue + } + if gotGroup != want { + t.Errorf("%s.Group = %q, want %q (summary_context index keys by group — empty Group zeros issueCount)", kind, gotGroup, want) + } + } +} + +func hasAllProblemTypes(problems []Problem) bool { + seen := map[string]bool{} + for _, p := range problems { + seen[p.Kind] = true + } + return seen["Deployment"] && seen["StatefulSet"] && seen["DaemonSet"] && seen["HorizontalPodAutoscaler"] && seen["Job"] +} diff --git a/internal/k8s/testing.go b/internal/k8s/testing.go index 799b2811e..560cbf632 100644 --- a/internal/k8s/testing.go +++ b/internal/k8s/testing.go @@ -4,7 +4,9 @@ import ( "sync" "github.com/skyhook-io/radar/pkg/k8score" + "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" + fakeclientset "k8s.io/client-go/kubernetes/fake" ) // InitTestResourceCache creates a resource cache from a fake or test client, @@ -68,6 +70,56 @@ func InitTestResourceCache(client kubernetes.Interface) error { return nil } +// InitTestDynamicResourceCache wires the dynamic resource cache and discovery +// singletons against test fakes. 
Pass a dynamic client (typically from +// dynamicfake.NewSimpleDynamicClientWithCustomListKinds) and the set of +// APIResources to register in discovery. Each registered resource gets a GVR +// entry that group-qualified lookups (GetGVRWithGroup) and dynamic informers +// can resolve. +// +// Callers should defer ResetTestDynamicState — without it, the dynamic +// singletons leak into other tests that share TestMain state. +// +// This is intended for integration tests only. +func InitTestDynamicResourceCache(dynClient dynamic.Interface, resources []APIResource) error { + clientMu.Lock() + dynamicClient = dynClient + clientMu.Unlock() + + // Bootstrap discovery from a fake clientset so NewResourceDiscovery has a + // non-nil discovery client; AddAPIResource then registers the test-only + // GVRs (e.g. serving.knative.dev/Service) the test depends on. + fakeDisc := fakeclientset.NewSimpleClientset().Discovery() + core, err := k8score.NewResourceDiscovery(fakeDisc) + if err != nil { + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() + return err + } + for _, r := range resources { + core.AddAPIResource(r) + } + + discoveryMu.Lock() + resourceDiscovery = &ResourceDiscovery{ResourceDiscovery: core} + discoveryOnce = new(sync.Once) + discoveryOnce.Do(func() {}) + discoveryMu.Unlock() + + return InitDynamicResourceCache(nil) +} + +// ResetTestDynamicState tears down the dynamic cache + discovery singletons +// and clears the dynamic client. Pairs with InitTestDynamicResourceCache. +func ResetTestDynamicState() { + ResetDynamicResourceCache() + ResetResourceDiscovery() + clientMu.Lock() + dynamicClient = nil + clientMu.Unlock() +} + // SetTestContextName is a test-only helper that overrides the package-level // kubeconfig context name. Used by tests that exercise per-context state // (e.g. namespace preferences) without needing to spin up a real client. 
 diff --git a/internal/mcp/rc_rbac.go b/internal/mcp/rc_rbac.go
new file mode 100644
index 000000000..393c26ee3
--- /dev/null
+++ b/internal/mcp/rc_rbac.go
@@ -0,0 +1,91 @@
+package mcp
+
+import (
+	"context"
+
+	"github.com/skyhook-io/radar/internal/k8s"
+	"github.com/skyhook-io/radar/pkg/resourcecontext"
+)
+
+// requestScopedChecker adapts the MCP-side RBAC helpers
+// (canReadInNamespace / canReadClusterScopedKind) into
+// resourcecontext.RefAccessChecker with a request-local memoization layer
+// keyed on (verb, group, kind, namespace).
+//
+// A single resourceContext build emits ~30 candidate refs but only ~5
+// distinct (group, kind, namespace) tuples — caching here collapses the
+// SAR fan-out before reaching the inner per-user cache. Mirrors the REST
+// equivalent in internal/server/rc_rbac.go so the two surfaces share the
+// same enforcement story.
+//
+// Request-scoped (not server-scoped): per-user caching already lives one
+// layer down. This layer only deduplicates the burst a single Build
+// invocation generates.
+type requestScopedChecker struct {
+	// ctx is the context captured at construction; CanRead hands it to
+	// the RBAC helpers instead of its own context argument.
+	ctx context.Context
+	// cache memoizes allow/deny decisions under a
+	// "verb|group|kind|namespace" string key. It is a plain map with no
+	// locking.
+	cache map[string]bool
+}
+
+// newMCPRequestScopedChecker returns a checker scoped to a single MCP
+// tool call. Not safe for concurrent use across calls.
+func newMCPRequestScopedChecker(ctx context.Context) *requestScopedChecker {
+	return &requestScopedChecker{
+		ctx: ctx,
+		// 8 is only an initial capacity hint — presumably sized for the
+		// handful of distinct tuples a single build produces.
+		cache: make(map[string]bool, 8),
+	}
+}
+
+// CanRead implements resourcecontext.RefAccessChecker.
+//
+// Authorization rules mirror the REST adapter:
+//   - Namespaced kinds: SAR on (verb=get, group, resource, namespace).
+//   - Cluster-scoped kinds (namespace == ""): SAR on (verb=get, group,
+//     resource, "").
+//   - Unknown kinds (not in discovery, not in static catalogue) pass
+//     through — Build only emits refs whose kinds are known to the
+//     topology builder, and an unknown kind here is a temporary
+//     discovery-cold state, not a permission bypass vector.
+func (c *requestScopedChecker) CanRead(_ context.Context, group, kind, namespace string) bool {
+	// One memo slot per (verb, group, kind, namespace); the verb is always
+	// "get". The per-call context argument is ignored in favour of the
+	// context captured at construction (c.ctx).
+	memoKey := "get|" + group + "|" + kind + "|" + namespace
+	if cached, hit := c.cache[memoKey]; hit {
+		return cached
+	}
+
+	// Kinds that cannot be resolved to a resource name pass through as
+	// allowed; only resolvable kinds reach the RBAC helpers.
+	decision := true
+	if plural := lookupResourceName(kind, group); plural != "" {
+		switch namespace {
+		case "":
+			// Empty namespace means a cluster-scoped check.
+			decision = canReadClusterScopedKind(c.ctx, kind, group, "get")
+		default:
+			decision = canReadInNamespace(c.ctx, group, plural, namespace, "get")
+		}
+	}
+
+	// Both allow and deny outcomes are remembered for the rest of the request.
+	c.cache[memoKey] = decision
+	return decision
+}
+
+// Compile-time assertion that requestScopedChecker satisfies the contract.
+var _ resourcecontext.RefAccessChecker = (*requestScopedChecker)(nil)
+
+// lookupResourceName resolves a (kind, group) pair to the canonical plural
+// resource name used by SubjectAccessReview. Tries the static cluster-only
+// catalogue first (covers Nodes / ClusterRoles / etc.), then discovery for
+// everything else including CRDs. Returns "" when neither path knows the
+// kind. Mirrors internal/server/rc_rbac.go's helper of the same name.
+func lookupResourceName(kind, group string) string {
+	if kind == "" {
+		return ""
+	}
+
+	// Static catalogue first: it only answers when the caller left the
+	// group empty or named the catalogue's own group.
+	staticGroup, staticResource, known := k8s.ClusterOnlyKindGVR(kind)
+	if known && (group == "" || group == staticGroup) {
+		return staticResource
+	}
+
+	// Otherwise fall back to discovery, when it has been initialised.
+	discovery := k8s.GetResourceDiscovery()
+	if discovery == nil {
+		return ""
+	}
+	apiResource, found := discovery.GetResourceWithGroup(kind, group)
+	if !found {
+		return ""
+	}
+	return apiResource.Name
+}
diff --git a/internal/mcp/resource_context.go b/internal/mcp/resource_context.go
new file mode 100644
index 000000000..6886da284
--- /dev/null
+++ b/internal/mcp/resource_context.go
@@ -0,0 +1,224 @@
+package mcp
+
+import (
+	"sort"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
+
+	"github.com/skyhook-io/radar/internal/audit"
+	"github.com/skyhook-io/radar/internal/issues"
+	"github.com/skyhook-io/radar/internal/k8s"
+	bpaudit "github.com/skyhook-io/radar/pkg/audit"
+	"github.com/skyhook-io/radar/pkg/policyreports"
+	"github.com/skyhook-io/radar/pkg/resourcecontext"
+	topo "github.com/skyhook-io/radar/pkg/topology"
+)
+
+// mcpPolicyReportLookupAdapter wraps k8s.GetPolicyReportIndex into the
+// resourcecontext.PolicyReportLookup interface. Mirrors the REST adapter in
+// internal/server/ai_handlers.go — keeping the projection narrow here lets
+// pkg/policyreports.Finding evolve without perturbing the wire contract.
+type mcpPolicyReportLookupAdapter struct {
+	idx *policyreports.Index
+}
+
+// FindingsFor projects the index's findings for one resource onto the
+// four-field KyvernoFinding shape (Policy, Rule, Result, Message). It
+// returns nil when the adapter has no index or the index has no findings
+// for the resource.
+func (a mcpPolicyReportLookupAdapter) FindingsFor(group, kind, namespace, name string) []resourcecontext.KyvernoFinding {
+	if a.idx == nil {
+		return nil
+	}
+	raw := a.idx.FindingsFor(group, kind, namespace, name)
+	if len(raw) == 0 {
+		return nil
+	}
+	projected := make([]resourcecontext.KyvernoFinding, 0, len(raw))
+	for i := range raw {
+		projected = append(projected, resourcecontext.KyvernoFinding{
+			Policy:  raw[i].Policy,
+			Rule:    raw[i].Rule,
+			Result:  raw[i].Result,
+			Message: raw[i].Message,
+		})
+	}
+	return projected
+}
+
+// mcpServiceBackendLookup answers pod-by-selector queries from the
+// resource cache's pod lister.
+type mcpServiceBackendLookup struct {
+	cache *k8s.ResourceCache
+}
+
+// PodsForServiceSelector lists the pods in namespace that match selector.
+// It returns (nil, nil) when the cache or its pod lister is unavailable.
+func (l mcpServiceBackendLookup) PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) {
+	if l.cache == nil {
+		return nil, nil
+	}
+	podLister := l.cache.Pods()
+	if podLister == nil {
+		return nil, nil
+	}
+	return podLister.Pods(namespace).List(selector)
+}
+
+// computeMCPIssueSummary rolls up per-resource issue-composer rows
+// (problem + condition + optional audit) into an IssueSummary. Mirrors the
+// REST handler's computeIssueSummaryForResource — same composer call, same
+// group-aware iteration filter, same deterministic sort. The composer's
+// native namespace filter restricts the scan to the resource's namespace;
+// the per-row group check prevents cross-group collisions where a CRD and
+// a built-in share kind+ns+name.
+//
+// Pascal-singular kind required: the composer's Filters.Kinds matcher
+// case-folds both sides but doesn't plural-to-singular convert. Callers
+// pass canonicalKind from obj's TypeMeta.
+//
+// The composer runs uncapped (issues.NoLimit). Rows are narrowed to the
+// subject resource only after ComposeWithStats returns, and the composer
+// sorts by severity before applying its cap, so a capped call could slice
+// off the subject's rows on a namespace with more matching issues than the
+// cap and report no issues for a resource that has some.
+//
+// Returns nil when the cache or issue provider is unavailable, or when no
+// row matches the subject's (group, namespace, name).
+func computeMCPIssueSummary(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.IssueSummary {
+	if cache == nil {
+		return nil
+	}
+	provider := issues.NewCacheProvider()
+	if provider == nil {
+		return nil
+	}
+	filters := issues.Filters{
+		Kinds: []string{kind},
+		// Uncapped: the per-name filter below must see every row.
+		Limit: issues.NoLimit,
+	}
+	if namespace != "" {
+		filters.Namespaces = []string{namespace}
+	}
+	// Compose stats are not needed here; only the rows are inspected.
+	rows, _ := issues.ComposeWithStats(provider, filters)
+
+	// Only a handful of rows normally belong to one resource, so let the
+	// slice grow on demand instead of reserving len(rows).
+	var matched []issues.Issue
+	bySource := make(map[string]int)
+	for _, row := range rows {
+		if row.Name != name {
+			continue
+		}
+		// Cluster-scoped subjects (namespace == "") skip the namespace check.
+		if namespace != "" && row.Namespace != namespace {
+			continue
+		}
+		// Group check keeps a CRD and a built-in with the same
+		// kind+ns+name from being merged.
+		if row.Group != group {
+			continue
+		}
+		matched = append(matched, row)
+		bySource[string(row.Source)]++
+	}
+	if len(matched) == 0 {
+		return nil
+	}
+	// (severity desc, Reason asc) — deterministic across runs.
+	sort.Slice(matched, func(i, j int) bool {
+		ri, rj := mcpComposeSeverityRank(matched[i].Severity), mcpComposeSeverityRank(matched[j].Severity)
+		if ri != rj {
+			return ri > rj
+		}
+		return matched[i].Reason < matched[j].Reason
+	})
+	return &resourcecontext.IssueSummary{
+		Count:           len(matched),
+		HighestSeverity: string(matched[0].Severity),
+		TopReason:       matched[0].Reason,
+		BySource:        bySource,
+	}
+}
+
+// mcpComposeSeverityRank orders composer severities for sorting:
+// critical (2) above warning (1) above anything else (0).
+func mcpComposeSeverityRank(s issues.Severity) int {
+	switch s {
+	case issues.SeverityCritical:
+		return 2
+	case issues.SeverityWarning:
+		return 1
+	}
+	return 0
+}
+
+// computeMCPAuditSummary looks up audit findings for the subject resource
+// via the group-aware (group, Kind, ns, name) key. Mirrors the REST
+// handler's computeAuditSummaryForResource.
+//
+// kind MUST be Pascal singular — the audit check runner writes that into
+// Finding.Kind, and Finding.Group is populated by audit.buildResults via
+// the built-in (Kind→Group) table, so the lookup keys correctly.
+func computeMCPAuditSummary(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.AuditSummary {
+	if cache == nil || kind == "" {
+		return nil
+	}
+
+	// A nil scope is passed through when the subject has no namespace.
+	var scope []string
+	if namespace != "" {
+		scope = append(scope, namespace)
+	}
+	results := audit.RunFromCache(cache, scope, nil)
+	if results == nil || len(results.Findings) == 0 {
+		return nil
+	}
+
+	hits := bpaudit.IndexByResource(results.Findings)[bpaudit.ResourceKey(group, kind, namespace, name)]
+	if len(hits) == 0 {
+		return nil
+	}
+
+	// (severity desc, CheckID asc) so the top finding is stable across runs.
+	sort.Slice(hits, func(a, b int) bool {
+		rankA, rankB := mcpAuditSeverityRank(hits[a].Severity), mcpAuditSeverityRank(hits[b].Severity)
+		if rankA == rankB {
+			return hits[a].CheckID < hits[b].CheckID
+		}
+		return rankA > rankB
+	})
+
+	top := hits[0]
+	return &resourcecontext.AuditSummary{
+		Count:           len(hits),
+		HighestSeverity: mcpNormalizeAuditSeverity(top.Severity),
+		TopFinding:      top.CheckID,
+	}
+}
+
+// mcpAuditSeverityRank orders audit severities for sorting: danger (2)
+// above warning (1) above anything else (0).
+func mcpAuditSeverityRank(s string) int {
+	if s == bpaudit.SeverityDanger {
+		return 2
+	}
+	if s == bpaudit.SeverityWarning {
+		return 1
+	}
+	return 0
+}
+
+// mcpNormalizeAuditSeverity maps the audit suite's emission vocabulary
+// ("danger" / "warning") onto the unified resourceContext severity scale
+// ("critical" / "warning") used by issueSummary. Two sibling fields in
+// the same response reporting severity in different vocabularies is a
+// wire-shape footgun — mirror the REST handler's normalizeAuditSeverity.
+// Any other value is returned unchanged.
+func mcpNormalizeAuditSeverity(s string) string {
+	if s == bpaudit.SeverityDanger {
+		return string(issues.SeverityCritical)
+	}
+	if s == bpaudit.SeverityWarning {
+		return string(issues.SeverityWarning)
+	}
+	return s
+}
+
+// mcpTopologyForContext returns a per-call topology snapshot scoped to the
+// resource's namespace (cluster-scoped resources get an all-namespaces
+// build), together with the resource and dynamic providers it was built
+// from. Reuses the package-level summaryCtxTopoMemo cache to amortize
+// build cost across get_resource and list_resources / search calls.
+//
+// The final bool is false — and every other result nil — when the resource
+// cache is missing, the build fails, or the memoizer yields a nil
+// topology. A nil return is fine — Build then skips topology-derived
+// fields and the remaining sidecar still populates.
+func mcpTopologyForContext(namespace string) (*topo.Topology, topo.ResourceProvider, topo.DynamicProvider, bool) {
+	cache := k8s.GetResourceCache()
+	if cache == nil {
+		return nil, nil, nil, false
+	}
+
+	opts := topo.DefaultBuildOptions()
+	if namespace != "" {
+		opts.Namespaces = []string{namespace}
+	}
+	provider := k8s.NewTopologyResourceProvider(cache)
+	dyn := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())
+
+	build := func() (*topo.Topology, error) {
+		return topo.NewBuilder(provider).WithDynamic(dyn).Build(opts)
+	}
+	topology, err := summaryCtxTopoMemo.Get(opts, build)
+	if err == nil && topology != nil {
+		return topology, provider, dyn, true
+	}
+	return nil, nil, nil, false
+}
+
+// _ guards the imports used by Build's Options struct from being marked
+// unused if the helpers above ever drop their references during refactors.
+var _ = runtime.Object(nil)
diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go
new file mode 100644
index 000000000..78f36cf0f
--- /dev/null
+++ b/internal/mcp/summary_context.go
@@ -0,0 +1,96 @@
+// Per-request helpers that compute the compact ResourceSummaryContext attached
+// to list_resources rows and search hits served via MCP.
+//
+// The shared core (issue index, kind canonicalization, managedBy
+// resolution, per-row scope dispatch) lives in
+// internal/summarycontext. This file is the MCP-specific wrapper — it
+// sources topology from a short-TTL per-process memoizer (MCP has no
+// shared broadcaster cache) and otherwise just plumbs arguments through.
+
+package mcp
+
+import (
+	"time"
+
+	"github.com/skyhook-io/radar/internal/issues"
+	"github.com/skyhook-io/radar/internal/k8s"
+	"github.com/skyhook-io/radar/internal/summarycontext"
+	"github.com/skyhook-io/radar/pkg/topology"
+)
+
+// newResourceSummaryContextBuilder assembles the per-request closure for MCP
+// list_resources. Returns nil when the issues cache provider isn't
+// available, in which case the caller should skip context attachment
+// rather than emit empty objects. A missing topology does not produce a
+// nil builder here: whatever buildSummaryContextTopology returns is handed
+// to summarycontext.BuilderFromIndexes as-is.
+//
+// namespaces scopes the issue index to just the rows being returned;
+// pass nil for cluster-wide.
+//
+// Use newSearchSummaryContextBuilder for MCP search, which routes
+// per-hit between a namespaced and a cluster-wide index — search
+// returns mixed kinds in one response, so a single index can't get
+// both right.
+func newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder {
+	cacheProvider := issues.NewCacheProvider()
+	if cacheProvider == nil {
+		return nil
+	}
+	// One index fills both index slots of BuilderFromIndexes.
+	issueIdx := summarycontext.BuildIssueIndex(cacheProvider, namespaces)
+	snapshot := buildSummaryContextTopology(namespaces)
+	return summarycontext.BuilderFromIndexes(snapshot, issueIdx, issueIdx)
+}
+
+// newSearchSummaryContextBuilder is the MCP search variant. Mirrors
+// internal/server.newSearchSummaryContextBuilder — see that comment for
+// the dual-index rationale (mixed-kind hits, cluster-scoped issues at
+// namespace=""). MCP search-level RBAC (CanReadClusterScoped via
+// canReadClusterScopedKind) already gates which cluster-scoped kinds
+// are reachable, so composing the cluster-wide index doesn't leak
+// rows the user can't see.
+func newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) + } + return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(scanNamespaces), namespacedIdx, clusterIdx) +} + +// summaryCtxTopoMemo caches topology builds across summary-context list and +// search invocations. MCP has no shared broadcaster cache, so without +// memoization every list_resources / search call from an agent pays a +// full topology build (multi-second on multi-thousand-resource clusters). +// 5s TTL matches the REST broadcaster's cadence — short enough that +// managedBy stays current after a context switch, long enough that a +// burst of agent calls amortizes the build cost. +// +// Other MCP tools (handleGetResource, get_neighborhood) still build +// inline; threading them through here is a separate follow-up. +var summaryCtxTopoMemo = topology.NewMemoizer(5 * time.Second) + +// buildSummaryContextTopology returns a topology snapshot suitable for +// resolving managedBy pointers, reusing a cached snapshot when one is +// fresh. Returns nil on failure — the caller falls back to a +// managedBy-less ResourceSummaryContext rather than failing the response. +func buildSummaryContextTopology(namespaces []string) *topology.Topology { + cache := k8s.GetResourceCache() + if cache == nil { + return nil + } + opts := topology.DefaultBuildOptions() + if len(namespaces) > 0 { + opts.Namespaces = namespaces + } + topo, err := summaryCtxTopoMemo.Get(opts, func() (*topology.Topology, error) { + builder := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). 
+ WithDynamic(k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())) + return builder.Build(opts) + }) + if err != nil { + return nil + } + return topo +} diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index d76d9b612..79649b8fd 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -6,21 +6,26 @@ import ( "fmt" "io" "log" + "regexp" "sort" "strings" "time" "github.com/modelcontextprotocol/go-sdk/mcp" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" "github.com/skyhook-io/radar/internal/filter" "github.com/skyhook-io/radar/internal/helm" "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" "github.com/skyhook-io/radar/internal/search" + "github.com/skyhook-io/radar/internal/summarycontext" "github.com/skyhook-io/radar/internal/timeline" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" topology "github.com/skyhook-io/radar/pkg/topology" ) @@ -29,64 +34,92 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_dashboard", - Description: "Get cluster health overview including resource counts, " + - "problems (failing pods, unhealthy deployments), recent warning events, " + - "and Helm release status. Start here to understand cluster state before " + - "drilling into specific resources.", + Description: "Use for inventory-style cluster or namespace health triage, like " + + "`kubectl get all` plus detected problems and warning events in one call. " + + "Returns resource counts, failing pods, unhealthy workloads, recent Warning " + + "events, and Helm release status so you can rank likely suspects before " + + "calling get_resource or logs. 
Routing: unknown broken thing -> issues; " + + "content/name search -> search; service routing/dependencies -> get_topology " + + "or get_neighborhood; inventory/counts/Helm/events overview -> get_dashboard.", Annotations: readOnly, }, logToolCall("get_dashboard", handleGetDashboard)) mcp.AddTool(server, &mcp.Tool{ Name: "list_resources", - Description: "List Kubernetes resources of a given kind with minified summaries. " + - "Supports all built-in kinds (pods, deployments, services, etc.) and CRDs. " + - "Use to discover what's running before inspecting individual resources.", + Description: "Use for a jq-like namespace sweep when you know the resource kind " + + "(pods, deployments, services, configmaps, CRDs). Returns compact Kubernetes-shaped " + + "rows plus summaryContext by default (managedBy, health, issueCount) so you can " + + "compare many similar resources and pick suspects before calling get_resource. " + + "For unknown kind/name searches, use search. For broad health triage, use " + + "get_dashboard or issues first.", Annotations: readOnly, }, logToolCall("list_resources", handleListResources)) mcp.AddTool(server, &mcp.Tool{ Name: "get_resource", - Description: "Get detailed information about a single Kubernetes resource. " + - "Returns minified spec, status, and metadata. " + - "Use after list_resources to drill into a specific resource. " + - "Optionally include related context (events, relationships, metrics, logs) " + - "using the 'include' parameter (comma-separated) to avoid extra tool calls.", + Description: "Use AFTER narrowing to one resource. Returns the resource's " + + "Kubernetes-shaped spec/status/metadata plus resourceContext when available " + + "(relationships, refs, issue/audit/policy rollups). This is the drill-down " + + "tool, not the best first call for broad incidents. Start with issues, " + + "get_dashboard, search, or list_resources to rank candidates; then call " + + "get_resource for the exact object. 
If you are looking for a string across " + + "ConfigMaps, CRD specs, env refs, or object content, use search instead of " + + "fetching resources one by one. Use the group parameter for ambiguous " + + "kinds such as Knative Service vs core Service.", Annotations: readOnly, }, logToolCall("get_resource", handleGetResource)) mcp.AddTool(server, &mcp.Tool{ Name: "get_topology", - Description: "Get the topology graph showing relationships between Kubernetes resources. " + - "Returns nodes and edges representing Deployments, Services, Ingresses, Pods, etc. " + - "Use 'traffic' view for network flow or 'resources' view for ownership hierarchy.", + Description: "Use to map a multi-service incident or dependency graph, preferably " + + "scoped to a namespace. " + + "Returns Kubernetes resource nodes and edges (Services, workloads, Pods, " + + "Ingresses, ConfigMaps, Secrets, owners) so you can see service-to-workload " + + "traffic and ownership relationships instead of inspecting resources one by one. " + + "Use view=traffic for routing/connectivity questions and view=resources for " + + "ownership/deployment hierarchy. Always specify namespace unless you specifically " + + "need a cross-namespace graph. If you already know the suspicious root, use " + + "get_neighborhood for a smaller focused graph.", Annotations: readOnly, }, logToolCall("get_topology", handleGetTopology)) mcp.AddTool(server, &mcp.Tool{ Name: "get_neighborhood", - Description: "Get the BFS-expanded neighborhood of a specific resource — the slice " + - "of the topology graph immediately relevant to one root. Cheaper and more " + - "focused than get_topology when you already know which resource you care " + - "about. Profile is 'auto' (default — picks a bounded edge set from the root " + - "kind) or 'all' (every edge type). Hops controls BFS depth (default 1, max " + - "2). Nodes are RBAC-filtered against the caller; dropped neighbors are " + - "listed in `omitted` with reason=rbac_denied. 
If max_nodes is exceeded " + - "mid-expansion, truncated=true is set and a partial subgraph is returned.", + Description: "Use when investigating cross-resource failures around a known " + + "resource: service routing, targetPort/selector/endpoints problems, dependency " + + "timeouts, config/secret refs, owner chains, or traffic not reaching pods. " + + "Returns the BFS-expanded topology neighborhood around one root, which is " + + "usually cheaper and clearer than get_topology once you have a suspect. " + + "Typical flow: issues/search/list_resources identify a Service or workload, " + + "then get_neighborhood traces its upstream/downstream Services, workloads, " + + "Pods, refs, and owners. Profile auto (default) picks a bounded edge set " + + "from the root kind; profile all expands every edge type and is heavier, " + + "use it only when auto produced a too-narrow neighborhood. Hops defaults to " + + "1 and maxes at 2. Nodes are RBAC-filtered; denied neighbors appear only as " + + "aggregate omitted counts.", Annotations: readOnly, }, logToolCall("get_neighborhood", handleGetNeighborhood)) mcp.AddTool(server, &mcp.Tool{ Name: "get_events", - Description: "Get recent Kubernetes warning events, deduplicated and sorted by recency. " + - "Useful for diagnosing issues — shows event reason, message, and occurrence count.", + Description: "Use for recent Kubernetes Warning events after an overview points " + + "at a namespace or resource, or when the symptom is scheduling, pulling images, " + + "restarts, failed mounts, readiness, or controller errors. Events are deduplicated " + + "and sorted by recency with reason, message, and count. For a ranked issue list " + + "that includes problems/conditions, use issues first.", Annotations: readOnly, }, logToolCall("get_events", handleGetEvents)) mcp.AddTool(server, &mcp.Tool{ Name: "get_pod_logs", - Description: "Get filtered log lines from a pod, prioritizing errors and warnings. 
" + - "Returns diagnostically relevant lines (errors, panics, stack traces) or " + - "falls back to the last 20 lines if no error patterns match.", + Description: "Use only after narrowing to a specific Pod/container. Returns " + + "diagnostically relevant log lines (errors, panics, stack traces, warnings) " + + "or falls back to recent tail lines. Set grep to server-side filter like " + + "`kubectl logs | grep PATTERN` when you know an error string, request path, " + + "service name, or trace id. For broad incidents, first use issues, " + + "get_dashboard, search, list_resources, or get_neighborhood to avoid reading " + + "logs from many unrelated pods. If the target is a config value, feature flag, " + + "CRD field, env ref, or YAML/spec content, use search rather than logs.", Annotations: readOnly, }, logToolCall("get_pod_logs", handleGetPodLogs)) @@ -99,9 +132,12 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_changes", - Description: "Get recent resource changes (creates, updates, deletes) from the cluster timeline. " + - "Use to investigate what changed before an incident. " + - "Filter by namespace, resource kind, or specific resource name.", + Description: "Use when the symptom is 'this worked earlier' or 'something broke " + + "after a deploy/config change.' Returns a chronological feed of resource " + + "creates, updates, and deletes such as image changes, ConfigMap edits, scale " + + "events, label edits, and rollout churn. This is often faster than reading " + + "ReplicaSet histories or individual audit/log streams. Pair with since to " + + "bound the window; filter by namespace, kind, or name when you know the scope.", Annotations: readOnly, }, logToolCall("get_changes", handleGetChanges)) @@ -113,7 +149,10 @@ func registerTools(server *mcp.Server) { "with remediation guidance. 
Checks cover security (running as root, privileged " + "containers, dangerous capabilities), reliability (missing probes, single replicas, " + "no PDB), and efficiency (missing resource requests/limits). " + - "Each finding includes what's wrong and how to fix it. " + + "Each finding includes what's wrong and how to fix it. The findings list contains " + + "only resources with audit violations; resources absent from findings should not " + + "be reported as non-compliant. If findings is empty for the requested scope/filter, " + + "there are no audit violations to report for that scope/filter. " + "Respects user's audit settings (ignored namespaces, disabled checks). " + "Filter by namespace, category, or severity.", Annotations: readOnly, @@ -135,7 +174,7 @@ func registerTools(server *mcp.Server) { Description: "Get detailed information about a specific Helm release including owned resources " + "and their status. Optionally include values, revision history, or manifest diff between revisions " + "using the 'include' parameter (comma-separated: values, history, diff). " + - "For diff, also provide diff_revision_1 and optionally diff_revision_2.", + "diff_revision_1 and diff_revision_2 are only used when include contains diff.", Annotations: readOnly, }, logToolCall("get_helm_release", handleGetHelmRelease)) @@ -175,8 +214,10 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "issues", - Description: "Unified cluster-health view. Combines hardcoded problem detection " + - "(failing Deployments / StatefulSets / CronJobs / HPAs / Nodes / Jobs / PVCs), " + + Description: "Use when the symptom is broad, unknown, or you need a ranked " + + "list of likely broken resources. This is the fastest way to find what is " + + "unhealthy before inspecting individual objects. 
Combines hardcoded problem " + + "detection (failing Deployments / StatefulSets / CronJobs / HPAs / Nodes / Jobs / PVCs), " + "recent K8s Warning events, and a generic CRD .status.conditions[] " + "fallback that lights up Argo / Flux / Knative / Crossplane / cert-manager / " + "KEDA without per-integration code. Severity is normalized to " + @@ -185,13 +226,17 @@ func registerTools(server *mcp.Server) { "(PolicyReport findings) are excluded by default because each can run " + "50–1000+ rows per cluster. The `source` param is a FILTER: " + "source=kyverno returns ONLY Kyverno rows (no problems, no conditions). " + + "For compliance, security-posture, or audit questions, use source=audit " + + "only; source=problem, source=condition, and source=event are runtime " + + "health signals and should not be reported as audit violations. " + + "When source includes audit, only resources with audit findings are returned; " + + "do not infer or report audit violations for resources that are absent. " + "To ADD an excluded source to the defaults via MCP, list everything " + "you want explicitly — e.g. source=problem,condition,kyverno returns " + - "defaults plus Kyverno. (The REST /api/issues endpoint also exposes " + - "include_audit / include_events / include_kyverno boolean flags as " + - "shortcuts, but MCP only takes the source list.) Use this instead of " + - "get_dashboard when you want the full health picture across all " + - "sources, or to filter by severity / source / kind / namespace.", + "defaults plus Kyverno. For a broader inventory plus Helm and event overview, " + + "use get_dashboard. 
After identifying a suspect issue, call get_resource for " + + "exact spec/status or get_neighborhood when the failure likely crosses " + + "Services/workloads/Pods/dependencies.", Annotations: readOnly, }, logToolCall("issues", handleIssuesTool)) @@ -199,15 +244,21 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "search", - Description: "Free-text resource search across this cluster's cache. Matches on " + - "name, namespace, label values, annotation values, container images, and " + - "kind. Tokens are AND'd. Modifiers: kind:Pod, ns:foo, label:app=bar, " + - "image:redis. Returns ranked hits with optional summary or raw object. " + - "Use this instead of list_resources when you don't already know the kind, " + - "namespace, or exact name — for example 'find anything called redis' or " + - "'show me everything pulling from quay.io/x'. Searches typed kinds plus " + - "any CRDs already warmed in the cache; cold CRDs need a list_resources " + - "call first to start watching.", + Description: "Use when you do not know the exact kind, namespace, or name, or " + + "when you need a grep-like scan across cached Kubernetes objects. Matches " + + "name, namespace, label values, annotation values, container images, kind, " + + "and searchable object content such as ConfigMap data, spec fields, status " + + "messages, env refs, and CRD specs. Tokens are AND'd. Examples: " + + "`adServiceFailure` finds feature flags in ConfigMap data; " + + "`kind:NetworkChaos delay` or `kind:PodChaos app=cart` finds Chaos Mesh " + + "faults; `image:flagd` finds feature-flag infrastructure. Modifiers include " + + "kind:Pod, kind:NetworkChaos, ns:foo, label:app=bar, image:redis. Returns ranked hits with matched " + + "content snippets and summaryContext by default so you can rank suspects " + + "before get_resource. Use this for feature flags, Chaos Mesh objects, " + + "secret/config refs, unknown CRD names, or 'where does this string appear?' " + + "questions. 
Use CEL filter for structural predicates over kind/apiVersion/" + + "metadata/spec/status/labels/annotations. Searches typed kinds plus CRDs " + + "already warmed in cache; cold CRDs need list_resources first to start watching.", Annotations: readOnly, }, logToolCall("search", handleSearch)) @@ -235,9 +286,14 @@ func registerTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{ Name: "get_workload_logs", - Description: "Get aggregated, AI-filtered logs from all pods of a workload (Deployment, StatefulSet, " + - "or DaemonSet). Logs are collected from all matching pods concurrently, filtered for errors/warnings, " + - "and deduplicated. More useful than get_pod_logs when you need logs across all replicas of a workload.", + Description: "Get aggregated logs from all pods of a workload (Deployment, StatefulSet, " + + "or DaemonSet). Logs are collected from all matching pods concurrently, then " + + "server-side filtered to errors, warnings, panics, and stack traces using " + + "deterministic regex patterns and deduplicated. Set grep for additional " + + "server-side filtering before that summary stage, like `kubectl logs | grep PATTERN`. " + + "More useful than get_pod_logs when you need logs across all replicas of a workload. " + + "If the target is a config value, feature flag, CRD field, env ref, or YAML/spec " + + "content, use search rather than logs.", Annotations: readOnly, }, logToolCall("get_workload_logs", handleGetWorkloadLogs)) @@ -308,13 +364,14 @@ func registerTools(server *mcp.Server) { // Tool input types type dashboardInput struct { - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace. Use when triaging one app/tenant namespace before drilling into individual resources."` } type listResourcesInput struct { - Kind string `json:"kind" jsonschema:"resource kind to list, e.g. 
pods, deployments, services, configmaps"` + Kind string `json:"kind" jsonschema:"resource kind to list for a broad sweep, e.g. pods, deployments, services, configmaps. Prefer this before get_resource when comparing many same-kind objects."` Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. serving.knative.dev for Knative Service vs core Service)"` - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace for app-scoped triage"` + Context string `json:"context,omitempty" jsonschema:"per-row context: default attaches summaryContext (managedBy + health + issueCount) for suspect ranking; 'none' returns bare rows"` } type getResourceInput struct { @@ -322,12 +379,13 @@ type getResourceInput struct { Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. cluster.x-k8s.io for CAPI Cluster vs CNPG Cluster)"` Namespace string `json:"namespace" jsonschema:"resource namespace"` Name string `json:"name" jsonschema:"resource name"` - Include string `json:"include,omitempty" jsonschema:"comma-separated extras to include: events, relationships, metrics, logs"` + Include string `json:"include,omitempty" jsonschema:"optional sidecar data after narrowing to this object: events, relationships, metrics, logs. 
Separate from context; include may fetch heavier live/derived data."` + Context string `json:"context,omitempty" jsonschema:"resourceContext tier: default/basic attaches relationship and finding rollups (managedBy, exposes, selectedBy, uses, runsOn, issueSummary, auditSummary); 'none' returns a bare minified resource."` } type topologyInput struct { - Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` - View string `json:"view,omitempty" jsonschema:"view mode: traffic for network flow or resources for ownership hierarchy"` + Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace for a multi-service incident map; recommended unless you need cross-namespace topology"` + View string `json:"view,omitempty" jsonschema:"view mode: traffic for service routing/connectivity or resources for ownership hierarchy"` Format string `json:"format,omitempty" jsonschema:"output format: graph (default, full node/edge data) or summary (text description of resource chains)"` } @@ -351,13 +409,25 @@ type podLogsInput struct { Name string `json:"name" jsonschema:"pod name"` Container string `json:"container,omitempty" jsonschema:"container name, defaults to first container"` TailLines int `json:"tail_lines,omitempty" jsonschema:"number of lines to fetch from the end (default 200)"` + Grep string `json:"grep,omitempty" jsonschema:"optional regular expression to keep matching log lines before diagnostic filtering, like kubectl logs | grep PATTERN"` + Since string `json:"since,omitempty" jsonschema:"only return logs newer than this duration (e.g. 30s, 10m, 1h), like kubectl logs --since"` + Previous bool `json:"previous,omitempty" jsonschema:"return logs from the previous terminated container instance (e.g. for CrashLoopBackOff diagnosis), like kubectl logs -p"` } type searchInput struct { - Q string `json:"q" jsonschema:"search string. Free tokens AND'd. 
Modifiers: kind:Pod, ns:foo, label:k=v, image:redis"` + Query string `json:"query,omitempty" jsonschema:"search query for unknown resources or broad content scans. Free tokens AND'd. Matches identity plus searchable object content. Examples: adServiceFailure, kind:NetworkChaos delay, kind:ConfigMap flagd, image:flagd. Modifiers: kind:Pod, kind:NetworkChaos, ns:foo, label:k=v, image:redis"` + Q string `json:"q,omitempty" jsonschema:"alias for query"` Limit int `json:"limit,omitempty" jsonschema:"max hits returned (default 50, max 500)"` Include string `json:"include,omitempty" jsonschema:"per-hit detail: summary (default), raw, or none"` Filter string `json:"filter,omitempty" jsonschema:"optional CEL boolean expression run against each candidate K8s object. Bindings: kind, apiVersion, metadata, spec, status, labels, annotations. Use has(x.y) before optional fields. Examples: 'kind == \"Pod\" && status.phase == \"Failed\"', 'labels[\"app\"] == \"cart\"', 'has(status.readyReplicas) && status.readyReplicas == 0'"` + Context string `json:"context,omitempty" jsonschema:"per-hit context: default attaches summaryContext (managedBy + health + issueCount) for suspect ranking; 'none' returns bare hits"` +} + +func (in searchInput) query() string { + if strings.TrimSpace(in.Query) != "" { + return in.Query + } + return in.Q } type issuesInput struct { @@ -464,13 +534,27 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li listScope = nil } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. Knative + // serving.knative.dev/Service vs corev1 ""/Service) reach the right + // resource. FetchResourceList is group-blind — it would silently + // return the core typed list, dropping the caller's group filter on + // the floor. Mirrors the group-aware short-circuit in REST + // handleAIListResources and handleGetResource (PR #721). 
+ if group != "" { + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) + } + + // Try typed cache first (group=="" → core/built-in lookup). objs, err := k8s.FetchResourceList(cache, kind, listScope) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs. ClassifyKindScope/SAR // above already authorized cluster-scoped CRDs; namespaced CRDs - // are scoped via listScope. - return listDynamicResources(ctx, cache, kind, group, listScope) + // are scoped via listScope. Pass clusterScoped through so the + // issue index drops the namespace filter for cluster-scoped + // CRDs — those issues live at namespace="" and would otherwise + // be filtered out by the user's namespaced-access set. + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) } if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) @@ -492,28 +576,62 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li return nil, nil, fmt.Errorf("failed to minify: %w", err) } + // Attach summaryContext per row unless caller opted out. Issue index + // is scoped to the listed kind so the per-row count reflects only + // the resource being listed (not unrelated noise in the namespace). + // + // Cluster-scoped kinds (Node, PV, cluster-scoped CRDs) emit issues + // at namespace="" — scoping the index to the user's namespaced + // access set would silently zero issueCount on every row. The + // cluster-scoped RBAC gate above (canReadClusterScopedKind) already + // authorized the read, so we pass nil here to compose cluster-wide. 
+ if input.Context != "none" { + idxNamespaces := allowed + if clusterScoped { + idxNamespaces = nil + } + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToTypedList(results, objs, builder) + } + } + return toJSONResult(results) } -func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string) (*mcp.CallToolResult, any, error) { - var allItems []any +func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, clusterScoped bool, contextMode string) (*mcp.CallToolResult, any, error) { + var rawItems []*unstructured.Unstructured if len(namespaces) > 0 { for _, ns := range namespaces { items, err := cache.ListDynamicWithGroup(ctx, kind, ns, group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) - } + rawItems = append(rawItems, items...) } } else { items, err := cache.ListDynamicWithGroup(ctx, kind, "", group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + rawItems = items + } + + allItems := make([]any, 0, len(rawItems)) + for _, item := range rawItems { + allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + } + + if contextMode != "none" { + // Cluster-scoped CRDs emit issues at namespace="" — passing a + // namespace-restricted slice would silently zero issueCount on + // every row. Caller has already gated cluster-scoped reads via + // canReadClusterScopedKind, so cluster-wide compose is safe. 
+ idxNamespaces := namespaces + if clusterScoped { + idxNamespaces = nil + } + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToUnstructuredList(allItems, rawItems, builder) } } @@ -564,43 +682,119 @@ func handleGetResource(ctx context.Context, req *mcp.CallToolRequest, input getR } } - // Try typed cache first. rawObj is the un-minified resource, threaded - // into attachResourceExtras so ManagedBy synthesis can disambiguate by - // group (avoids Knative Service vs core Service kind/plural collisions). + // Fetch the resource. When group is set, skip the typed cache and route + // directly to the dynamic cache: typed FetchResource is group-blind + // (e.g. for kind=services it returns the core Service regardless of any + // group qualifier), so a group-qualified call like serving.knative.dev/ + // Service would silently leak the wrong object. Mirrors the same + // group-first dispatch fix on the REST GET path in PR #721. var resourceData any - var rawObj any - obj, err := k8s.FetchResource(cache, kind, namespace, name) - if err == k8s.ErrUnknownKind { - // Fall through to dynamic cache for CRDs + var rawObj runtime.Object + if group != "" { u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) if dynErr != nil { return nil, nil, fmt.Errorf("resource not found: %w", dynErr) } resourceData = aicontext.MinifyUnstructured(u, aicontext.LevelDetail) rawObj = u - } else if err != nil { - return nil, nil, fmt.Errorf("resource not found: %w", err) } else { - k8s.SetTypeMeta(obj) - minified, minErr := aicontext.Minify(obj, aicontext.LevelDetail) - if minErr != nil { - return nil, nil, fmt.Errorf("failed to minify: %w", minErr) + obj, err := k8s.FetchResource(cache, kind, namespace, name) + if err == k8s.ErrUnknownKind { + u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if dynErr != nil { + return nil, nil, fmt.Errorf("resource not found: %w", dynErr) + } + resourceData = 
aicontext.MinifyUnstructured(u, aicontext.LevelDetail) + rawObj = u + } else if err != nil { + return nil, nil, fmt.Errorf("resource not found: %w", err) + } else { + k8s.SetTypeMeta(obj) + minified, minErr := aicontext.Minify(obj, aicontext.LevelDetail) + if minErr != nil { + return nil, nil, fmt.Errorf("failed to minify: %w", minErr) + } + resourceData = minified + rawObj = obj } - resourceData = minified - rawObj = obj } + // Build the resourceContext sidecar unless the caller opted out. Basic + // tier is the default: cheap managedBy / exposes / selectedBy / + // runsOn / uses / issueSummary / auditSummary / policySummary. Pass + // context=none for a bare minified resource (bulk scans, raw jq work). + contextMode := strings.ToLower(strings.TrimSpace(input.Context)) includes := parseIncludes(input.Include) - if len(includes) == 0 { + skipContext := contextMode == "none" + + var resourceCtx *resourcecontext.ResourceContext + if !skipContext { + resourceCtx = buildMCPResourceContext(ctx, rawObj, kind, namespace, name) + } + + // Three shapes: + // - bare resource: no includes, context=none + // - resource + resourceContext: no includes, default context + // - resource + resourceContext + extras: includes set + if len(includes) == 0 && resourceCtx == nil { return toJSONResult(resourceData) } - // Build enriched response with requested extras result := map[string]any{"resource": resourceData} - attachResourceExtras(ctx, cache, result, includes, kind, namespace, name, rawObj) + if resourceCtx != nil { + result["resourceContext"] = resourceCtx + } + if len(includes) > 0 { + attachResourceExtras(ctx, cache, result, includes, kind, namespace, name, rawObj) + } return toJSONResult(result) } +// buildMCPResourceContext assembles the resourceContext sidecar for MCP +// get_resource. 
Mirrors the REST handler's buildAIResourceContext: pre- +// computes IssueSummary + AuditSummary in the caller, threads the +// PolicyReport index when Kyverno is installed, hands a request-scoped +// RBAC checker to Build for per-ref gating, and lets Build's own +// fallback resolve Relationships via topology.GetRelationshipsWithObject +// (which applies KindForGVK so cross-group CRDs map to the right +// topology node). +func buildMCPResourceContext(ctx context.Context, obj runtime.Object, kind, namespace, name string) *resourcecontext.ResourceContext { + if obj == nil { + return nil + } + cache := k8s.GetResourceCache() + + gvk := obj.GetObjectKind().GroupVersionKind() + canonicalKind := gvk.Kind + if canonicalKind == "" { + canonicalKind = kind + } + canonicalGroup := gvk.Group + + issueSum := computeMCPIssueSummary(cache, canonicalGroup, canonicalKind, namespace, name) + auditSum := computeMCPAuditSummary(cache, canonicalGroup, canonicalKind, namespace, name) + + opts := resourcecontext.Options{ + Tier: resourcecontext.TierBasic, + AccessChecker: newMCPRequestScopedChecker(ctx), + IssueSummary: issueSum, + AuditSummary: auditSum, + ServiceBackends: mcpServiceBackendLookup{cache: cache}, + } + + if idx := k8s.GetPolicyReportIndex(); idx != nil { + opts.PolicyReports = mcpPolicyReportLookupAdapter{idx: idx} + } + + if topo, prov, dyn, ok := mcpTopologyForContext(namespace); ok { + opts.Topology = topo + opts.Provider = prov + opts.DynamicProv = dyn + } + + return resourcecontext.Build(ctx, obj, opts) +} + // attachResourceExtras populates optional extras (events, relationships, metrics, logs) // on the result map based on the includes set. 
rawObj is the already-fetched // resource (typed or *unstructured); passed through so relationship synthesis @@ -1151,9 +1345,20 @@ func handleGetPodLogs(ctx context.Context, req *mcp.CallToolRequest, input podLo if input.TailLines > 0 { tailLines = int64(input.TailLines) } + if strings.TrimSpace(input.Grep) != "" { + if _, err := regexp.Compile(input.Grep); err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } + } + sinceSeconds, err := parseLogsSince(input.Since) + if err != nil { + return nil, nil, err + } opts := &corev1.PodLogOptions{ - TailLines: &tailLines, + TailLines: &tailLines, + SinceSeconds: sinceSeconds, + Previous: input.Previous, } if input.Container != "" { opts.Container = input.Container @@ -1170,7 +1375,10 @@ func handleGetPodLogs(ctx context.Context, req *mcp.CallToolRequest, input podLo return nil, nil, fmt.Errorf("failed to read logs: %w", err) } - filtered := aicontext.FilterLogs(string(data)) + filtered, err := aicontext.FilterLogsByPattern(string(data), input.Grep) + if err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } return toJSONResult(filtered) } @@ -1989,7 +2197,11 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp if provider == nil { return nil, nil, fmt.Errorf("not connected to cluster") } - parsed := search.Parse(input.Q) + query := input.query() + if query == "" { + return nil, nil, fmt.Errorf("query is required") + } + parsed := search.Parse(query) allowed := filterNamespacesForUser(ctx, nil) if allowed != nil && len(allowed) == 0 { return toJSONResult(search.Result{Hits: []search.Hit{}}) @@ -2048,6 +2260,17 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp } opts.Filter = f } + // Search uses the dual-index variant: hits are mixed-kind (a single + // query can return both namespaced Pods and cluster-scoped Nodes), + // so a single namespace-scoped issue index zeroes issueCount on + // cluster-scoped hits whose 
problems live at namespace="". The + // builder routes per-hit by scope; CanReadClusterScoped above + // already gates which cluster-scoped kinds are reachable. + if input.Context != "none" { + if builder := newSearchSummaryContextBuilder(scanNamespaces); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } result, err := search.Search(ctx, provider, parsed, opts) if err != nil { return nil, nil, err diff --git a/internal/mcp/tools_apply.go b/internal/mcp/tools_apply.go index 5aea1dbbd..5a4206aab 100644 --- a/internal/mcp/tools_apply.go +++ b/internal/mcp/tools_apply.go @@ -88,4 +88,3 @@ func handleApplyResource(ctx context.Context, req *mcp.CallToolRequest, input ap "resources": results, }) } - diff --git a/internal/mcp/tools_audit.go b/internal/mcp/tools_audit.go index 7c60d8ab5..7211d1efd 100644 --- a/internal/mcp/tools_audit.go +++ b/internal/mcp/tools_audit.go @@ -16,14 +16,14 @@ type auditInput struct { Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` Category string `json:"category,omitempty" jsonschema:"filter by category: Security, Reliability, or Efficiency"` Severity string `json:"severity,omitempty" jsonschema:"filter by severity: danger or warning"` - Limit int `json:"limit,omitempty" jsonschema:"max findings to return (default 30, max 100)"` + Limit int `json:"limit,omitempty" jsonschema:"max audit violation findings to return (default 30, max 100). 
This limits findings only; compliant resources are not returned."` } type auditToolResult struct { - Summary auditSummary `json:"summary"` - Findings []auditFinding `json:"findings"` - TotalCount int `json:"totalCount"` - Truncated bool `json:"truncated,omitempty"` + Summary auditSummary `json:"summary"` + Findings []auditFinding `json:"findings"` + TotalCount int `json:"totalCount"` + Truncated bool `json:"truncated,omitempty"` } type auditSummary struct { @@ -34,10 +34,10 @@ type auditSummary struct { } type auditFinding struct { - Resource string `json:"resource"` // "Deployment/default/web" - Check string `json:"check"` // "runAsRoot" - Severity string `json:"severity"` // "danger" or "warning" - Category string `json:"category"` // "Security" + Resource string `json:"resource"` // "Deployment/default/web" + Check string `json:"check"` // "runAsRoot" + Severity string `json:"severity"` // "danger" or "warning" + Category string `json:"category"` // "Security" Message string `json:"message"` Remediation string `json:"remediation,omitempty"` } @@ -154,4 +154,3 @@ func loadAuditConfig() settings.AuditConfig { } return settings.DefaultAuditConfig() } - diff --git a/internal/mcp/tools_filter_test.go b/internal/mcp/tools_filter_test.go index 115a8a1dc..d32973de2 100644 --- a/internal/mcp/tools_filter_test.go +++ b/internal/mcp/tools_filter_test.go @@ -148,6 +148,46 @@ func containsName(payload, name string) bool { return strings.Contains(payload, `"name":"`+name+`"`) } +// TestHandleListResources_GroupRoutesToDynamic pins the group-aware +// short-circuit on the MCP list_resources path. For kind=services with +// no group, the typed core Service list returns the seeded fixture. For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// listDynamicResources instead. 
Mirrors the REST-side fix in +// handleAIListResources and the GET-side fix from PR #721. +// +// setupFakeCacheForFilterTests doesn't initialize the dynamic cache, so +// the dynamic call surfaces an error. listDynamicResources wraps it in +// "failed to list %s: …" — pin both that the result does NOT contain +// the core Service AND that the call returned the dynamic-cache error +// (proving the routing change is in place). +func TestHandleListResources_GroupRoutesToDynamic(t *testing.T) { + setupFakeCacheForFilterTests(t) + ctx := withRestrictedUser(t, "alice", []string{"alpha"}) + + // With no group: typed cache, but no Services in the fixture so + // it's an empty list. Sanity check the baseline. + _, _, err := handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha"}) + if err != nil { + t.Fatalf("baseline (no group): %v", err) + } + + // With group=serving.knative.dev: must route to dynamic. The fake + // cache has no dynamic discovery wired, so we expect an error + // rather than a (wrong) 200 with typed core Services. + _, _, err = handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha", Group: "serving.knative.dev"}) + if err == nil { + t.Fatalf("group=serving.knative.dev: expected dynamic-cache routing error (no discovery in test harness), got nil err — handler may have silently returned typed core Services (pre-fix bug)") + } + // The wrapped error should reflect the dynamic path, not a typed + // cache lookup. Match loosely on shape so future error-text + // refactors don't flake the test. 
+ if !strings.Contains(err.Error(), "services") { + t.Errorf("error should mention services kind: %v", err) + } +} + func TestHandleListResources_RestrictedUser(t *testing.T) { setupFakeCacheForFilterTests(t) @@ -596,6 +636,22 @@ func TestHandleSearch_Secrets_PerNamespaceFanout(t *testing.T) { } } +func TestHandleSearch_QueryAlias(t *testing.T) { + // Agents naturally call search with {"query": "..."}; keep accepting + // q, but make query work as the primary ergonomic field. + setupFakeCacheForFilterTests(t) + ctx := context.Background() + + result, _, err := handleSearch(ctx, nil, searchInput{Query: "alpha-pod"}) + if err != nil { + t.Fatalf("handleSearch: %v", err) + } + body := extractText(t, result) + if !containsName(body, "alpha-pod") { + t.Errorf("expected alpha-pod in search hits: %s", body) + } +} + func TestHandleSearch_Secrets_ClusterWideShape_NsFilter(t *testing.T) { // Regression for the bug where AllowedNamespaces==nil (cluster-wide // namespace sentinel) plus a concrete `ns:` filter took the cluster- @@ -626,3 +682,59 @@ func TestHandleSearch_Secrets_ClusterWideShape_NsFilter(t *testing.T) { t.Errorf("beta-secret leaked despite ns:alpha filter + per-ns RBAC: %s", body) } } + +func TestNormalizeWorkloadLogsKind_DefaultsToDeployment(t *testing.T) { + if got := normalizeWorkloadLogsKind(""); got != "deployments" { + t.Fatalf("blank workload-log kind = %q, want deployments", got) + } + if got := normalizeWorkloadLogsKind("statefulset"); got != "statefulsets" { + t.Fatalf("statefulset workload-log kind = %q, want statefulsets", got) + } +} + +func TestParseLogsSince(t *testing.T) { + tests := []struct { + name string + in string + wantSecs int64 + wantNil bool + wantError bool + }{ + {name: "empty returns nil", in: "", wantNil: true}, + {name: "whitespace returns nil", in: " ", wantNil: true}, + {name: "30s", in: "30s", wantSecs: 30}, + {name: "10m", in: "10m", wantSecs: 600}, + {name: "1h", in: "1h", wantSecs: 3600}, + {name: "sub-second floors to 1s", 
in: "500ms", wantSecs: 1}, + {name: "invalid format", in: "10minutes", wantError: true}, + {name: "negative duration", in: "-5m", wantError: true}, + {name: "zero duration", in: "0s", wantError: true}, + {name: "junk", in: "soon", wantError: true}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := parseLogsSince(tc.in) + if tc.wantError { + if err == nil { + t.Fatalf("expected error, got nil (result=%v)", got) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if tc.wantNil { + if got != nil { + t.Fatalf("expected nil, got %d", *got) + } + return + } + if got == nil { + t.Fatalf("expected %d, got nil", tc.wantSecs) + } + if *got != tc.wantSecs { + t.Fatalf("got %d, want %d", *got, tc.wantSecs) + } + }) + } +} diff --git a/internal/mcp/tools_helm.go b/internal/mcp/tools_helm.go index e4c805ff7..8600f31d3 100644 --- a/internal/mcp/tools_helm.go +++ b/internal/mcp/tools_helm.go @@ -33,8 +33,8 @@ type getHelmReleaseInput struct { Namespace string `json:"namespace" jsonschema:"release namespace"` Name string `json:"name" jsonschema:"release name"` Include string `json:"include,omitempty" jsonschema:"comma-separated extras to include: values, history, diff. 
Example: values,history"` - DiffRev1 int `json:"diff_revision_1,omitempty" jsonschema:"first revision for diff (requires include=diff)"` - DiffRev2 int `json:"diff_revision_2,omitempty" jsonschema:"second revision for diff (requires include=diff), defaults to current"` + DiffRev1 int `json:"diff_revision_1,omitempty" jsonschema:"first revision for diff; only used when include contains diff"` + DiffRev2 int `json:"diff_revision_2,omitempty" jsonschema:"second revision for diff; only used when include contains diff, defaults to current"` } // Helm tool handlers diff --git a/internal/mcp/tools_neighborhood.go b/internal/mcp/tools_neighborhood.go index a913267a5..b3a73e419 100644 --- a/internal/mcp/tools_neighborhood.go +++ b/internal/mcp/tools_neighborhood.go @@ -18,7 +18,7 @@ type getNeighborhoodInput struct { Group string `json:"group,omitempty" jsonschema:"API group required to disambiguate kinds that collide across groups. Examples: serving.knative.dev for KNative Service (vs core/v1 Service), cluster.x-k8s.io for CAPI Cluster (vs CNPG Cluster), networking.istio.io for Istio Gateway (vs gateway.networking.k8s.io Gateway). Omit for kinds with no known collisions."` Namespace string `json:"namespace,omitempty" jsonschema:"resource namespace; omit for cluster-scoped kinds"` Name string `json:"name" jsonschema:"resource name"` - Profile string `json:"profile,omitempty" jsonschema:"neighborhood breadth: auto or all. Default: auto (picks a bounded edge set from the root kind)."` + Profile string `json:"profile,omitempty" jsonschema:"neighborhood breadth: auto or all. Default: auto (picks a bounded edge set from the root kind). all expands every edge type and is heavier; use only when auto produced a too-narrow neighborhood."` Hops int `json:"hops,omitempty" jsonschema:"BFS depth. Default 1, max 2."` MaxNodes int `json:"max_nodes,omitempty" jsonschema:"node-budget cap. Default 25. 
When the cap is hit mid-expansion, truncated=true is set and the partial subgraph is returned."` } diff --git a/internal/mcp/tools_rbac.go b/internal/mcp/tools_rbac.go index 170727c08..7ab198976 100644 --- a/internal/mcp/tools_rbac.go +++ b/internal/mcp/tools_rbac.go @@ -35,12 +35,12 @@ type subjectPermissionsInput struct { } type subjectPermissionsResult struct { - Subject mcpSubject `json:"subject"` - Bindings []mcpBindingLite `json:"bindings"` + Subject mcpSubject `json:"subject"` + Bindings []mcpBindingLite `json:"bindings"` FlatRules []rbacv1.PolicyRule `json:"flatRules"` - Truncated bool `json:"truncated,omitempty"` - UsedByPods []string `json:"usedByPods,omitempty"` // "ns/name" pairs - PodsTotal int `json:"podsTotal,omitempty"` // >0 when usedByPods was truncated + Truncated bool `json:"truncated,omitempty"` + UsedByPods []string `json:"usedByPods,omitempty"` // "ns/name" pairs + PodsTotal int `json:"podsTotal,omitempty"` // >0 when usedByPods was truncated } type mcpSubject struct { @@ -53,7 +53,7 @@ type mcpSubject struct { // identify the binding and the role it grants; rule details are accessible // via get_resource on the role. 
type mcpBindingLite struct { - BindingKind string `json:"bindingKind"` // "RoleBinding" | "ClusterRoleBinding" + BindingKind string `json:"bindingKind"` // "RoleBinding" | "ClusterRoleBinding" BindingNamespace string `json:"bindingNamespace,omitempty"` BindingName string `json:"bindingName"` RoleKind string `json:"roleKind"` // "Role" | "ClusterRole" diff --git a/internal/mcp/tools_workloads.go b/internal/mcp/tools_workloads.go index ef990bba0..02e8a9379 100644 --- a/internal/mcp/tools_workloads.go +++ b/internal/mcp/tools_workloads.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "log" + "regexp" "sort" "strings" "sync" @@ -13,8 +14,8 @@ import ( "github.com/modelcontextprotocol/go-sdk/mcp" corev1 "k8s.io/api/core/v1" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" "github.com/skyhook-io/radar/internal/k8s" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" ) // Workload tool input types @@ -35,11 +36,37 @@ type manageCronJobInput struct { } type getWorkloadLogsInput struct { - Kind string `json:"kind" jsonschema:"workload kind: deployment, statefulset, or daemonset"` + Kind string `json:"kind,omitempty" jsonschema:"workload kind: deployment, statefulset, or daemonset. Defaults to deployment when omitted."` Namespace string `json:"namespace" jsonschema:"workload namespace"` Name string `json:"name" jsonschema:"workload name"` Container string `json:"container,omitempty" jsonschema:"specific container name, defaults to all containers"` TailLines int `json:"tail_lines,omitempty" jsonschema:"lines per pod (default 100)"` + Grep string `json:"grep,omitempty" jsonschema:"optional regular expression to keep matching log lines before diagnostic filtering, like kubectl logs | grep PATTERN"` + Since string `json:"since,omitempty" jsonschema:"only return logs newer than this duration (e.g. 30s, 10m, 1h), like kubectl logs --since"` + Previous bool `json:"previous,omitempty" jsonschema:"return logs from the previous terminated container instance (e.g. 
for CrashLoopBackOff diagnosis), like kubectl logs -p"` +} + +// parseLogsSince converts a relative duration string like "30s"/"10m"/"1h" +// into seconds for corev1.PodLogOptions.SinceSeconds. Empty input returns +// (nil, nil) so the caller can leave SinceSeconds unset. Negative or zero +// durations are rejected — kubectl's behavior on these is implementation- +// dependent and not useful for diagnosis. +func parseLogsSince(s string) (*int64, error) { + if strings.TrimSpace(s) == "" { + return nil, nil + } + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid since duration %q: %w (expected e.g. 30s, 10m, 1h)", s, err) + } + if d <= 0 { + return nil, fmt.Errorf("invalid since duration %q: must be positive", s) + } + secs := int64(d.Seconds()) + if secs < 1 { + secs = 1 + } + return &secs, nil } // Workload tool handlers @@ -141,7 +168,7 @@ func handleManageCronJob(ctx context.Context, req *mcp.CallToolRequest, input ma } func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input getWorkloadLogsInput) (*mcp.CallToolResult, any, error) { - kind := normalizeWorkloadKind(input.Kind) + kind := normalizeWorkloadLogsKind(input.Kind) if kind == "" { return nil, nil, fmt.Errorf("invalid kind %q: must be deployment, statefulset, or daemonset", input.Kind) } @@ -180,6 +207,15 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input if input.TailLines > 0 { tailLines = int64(input.TailLines) } + if strings.TrimSpace(input.Grep) != "" { + if _, err := regexp.Compile(input.Grep); err != nil { + return nil, nil, fmt.Errorf("invalid grep regex: %w", err) + } + } + sinceSeconds, err := parseLogsSince(input.Since) + if err != nil { + return nil, nil, err + } // Validate container name if specified if input.Container != "" { @@ -220,9 +256,11 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input defer wg.Done() opts := &corev1.PodLogOptions{ - Container: containerName, - 
TailLines: &tailLines, - Timestamps: true, + Container: containerName, + TailLines: &tailLines, + SinceSeconds: sinceSeconds, + Previous: input.Previous, + Timestamps: true, } entry := logEntry{ @@ -251,8 +289,8 @@ func handleGetWorkloadLogs(ctx context.Context, req *mcp.CallToolRequest, input return } - // Apply AI-optimized log filtering - entry.Logs = aicontext.FilterLogs(string(data)) + filtered, _ := aicontext.FilterLogsByPattern(string(data), input.Grep) + entry.Logs = filtered mu.Lock() allLogs = append(allLogs, entry) @@ -368,3 +406,10 @@ func normalizeWorkloadKind(kind string) string { return "" } } + +func normalizeWorkloadLogsKind(kind string) string { + if strings.TrimSpace(kind) == "" { + return "deployments" + } + return normalizeWorkloadKind(kind) +} diff --git a/internal/search/candidate.go b/internal/search/candidate.go index 2a2393ae5..c000807eb 100644 --- a/internal/search/candidate.go +++ b/internal/search/candidate.go @@ -1,6 +1,10 @@ package search import ( + "fmt" + "sort" + "strconv" + appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" @@ -25,6 +29,7 @@ func fromObject(obj runtime.Object, kind string) (candidate, bool) { Annotations: m.GetAnnotations(), } c.Images = imagesForTyped(obj) + c.Content = contentForTyped(obj, kind) return c, true } @@ -41,9 +46,81 @@ func fromUnstructured(u *unstructured.Unstructured, kind, group string) candidat Annotations: u.GetAnnotations(), } c.Images = imagesFromUnstructured(u) + c.Content = contentFromUnstructured(u, kind) return c } +func contentForTyped(obj runtime.Object, kind string) []ContentField { + if obj == nil { + return nil + } + // Secrets are intentionally not content-indexed. Search may expose Secret + // names to callers with Secret RBAC, but matching/snippeting data values + // would turn search into a secret-value disclosure path. 
+ if kind == "Secret" { + return nil + } + m, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil + } + return contentFromMap(m, kind) +} + +func contentFromUnstructured(u *unstructured.Unstructured, kind string) []ContentField { + if u == nil || u.Object == nil || kind == "Secret" { + return nil + } + return contentFromMap(u.Object, kind) +} + +func contentFromMap(obj map[string]any, kind string) []ContentField { + var out []ContentField + if kind == "ConfigMap" { + walkContent("data", obj["data"], &out) + walkContent("binaryData", obj["binaryData"], &out) + return out + } + // These roots capture the useful grep-like surface without indexing noisy + // metadata such as managedFields or leaking Secret data values. + walkContent("spec", obj["spec"], &out) + walkContent("status", obj["status"], &out) + walkContent("data", obj["data"], &out) + return out +} + +func walkContent(path string, v any, out *[]ContentField) { + switch x := v.(type) { + case nil: + return + case string: + if x != "" { + *out = append(*out, ContentField{Path: path, Value: x}) + } + case bool: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatBool(x)}) + case int: + *out = append(*out, ContentField{Path: path, Value: strconv.Itoa(x)}) + case int64: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatInt(x, 10)}) + case float64: + *out = append(*out, ContentField{Path: path, Value: strconv.FormatFloat(x, 'f', -1, 64)}) + case map[string]any: + keys := make([]string, 0, len(x)) + for k := range x { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + walkContent(path+"."+k, x[k], out) + } + case []any: + for i, item := range x { + walkContent(fmt.Sprintf("%s[%d]", path, i), item, out) + } + } +} + func imagesForTyped(obj runtime.Object) []string { switch o := obj.(type) { case *corev1.Pod: diff --git a/internal/search/score.go b/internal/search/score.go index 16a407cc1..c877237e1 100644 --- 
a/internal/search/score.go +++ b/internal/search/score.go @@ -13,8 +13,11 @@ const ( scoreLabelValSubstr = 18 scoreAnnoSubstr = 15 scoreImageSubstr = 20 + scoreContentExact = 16 + scoreContentSubstr = 12 scoreKindExact = 10 scoreKindSubstr = 5 + maxSnippetRunes = 180 ) // candidate carries the searchable face of a K8s object: identity, @@ -28,63 +31,84 @@ type candidate struct { Labels map[string]string Annotations map[string]string Images []string + Content []ContentField +} + +// ContentField is a searchable string extracted from object content, such as +// ConfigMap data, workload env refs, CRD spec fields, or status messages. +type ContentField struct { + Path string + Value string } // match runs the parsed query against a candidate and returns the score // plus which sites matched. Returns (0, nil, false) when filters reject // the candidate or when at least one free token didn't land anywhere. -func match(q Query, c candidate) (int, []MatchedField, bool) { +func match(q Query, c candidate) (int, []MatchedField, []MatchSnippet, bool) { // Hard filters first — cheaper to reject early. if len(q.KindFilter) > 0 && !kindMatches(c.Kind, q.KindFilter) { - return 0, nil, false + return 0, nil, nil, false } if len(q.NSFilter) > 0 && !sliceContainsFold(q.NSFilter, c.Namespace) { - return 0, nil, false + return 0, nil, nil, false } for _, lf := range q.LabelFilter { v, ok := c.Labels[lf.Key] if !ok { - return 0, nil, false + return 0, nil, nil, false } if lf.Value != "" && v != lf.Value { - return 0, nil, false + return 0, nil, nil, false } } for _, img := range q.ImageFilter { if !anyContainsFold(c.Images, img) { - return 0, nil, false + return 0, nil, nil, false } } if len(q.Tokens) == 0 { // Pure-filter query: no scoring signal, but the candidate passed // every filter, so return a flat score so it shows up. 
- return 1, nil, true + return 1, nil, nil, true } total := 0 var matched []MatchedField + var snippets []MatchSnippet for _, tok := range q.Tokens { - best, site, ok := scoreToken(tok, c) + best, site, snippet, ok := scoreToken(tok, c) if !ok { - return 0, nil, false + return 0, nil, nil, false } total += best matched = append(matched, MatchedField{Token: tok, Site: site, Score: best}) + if snippet != nil { + snippets = append(snippets, *snippet) + } } - return total, matched, true + return total, matched, snippets, true } // scoreToken returns the highest-scoring site a single free token matches, // or (0, "", false) if the token doesn't land on any searchable field. -func scoreToken(tok string, c candidate) (int, string, bool) { +func scoreToken(tok string, c candidate) (int, string, *MatchSnippet, bool) { low := strings.ToLower(tok) best := 0 bestSite := "" + var bestSnippet *MatchSnippet consider := func(score int, site string) { if score > best { best = score bestSite = site + bestSnippet = nil + } + } + considerSnippet := func(score int, site string, snip MatchSnippet) { + if score > best { + best = score + bestSite = site + bestSnippet = &snip } } @@ -127,6 +151,26 @@ func scoreToken(tok string, c candidate) (int, string, bool) { consider(scoreImageSubstr, "image") } } + for _, cf := range c.Content { + if cf.Value == "" { + continue + } + vLow := strings.ToLower(cf.Value) + switch { + case vLow == low: + considerSnippet(scoreContentExact, "content:"+cf.Path, MatchSnippet{ + Token: tok, + Path: cf.Path, + Snippet: snippetForToken(cf.Value, tok), + }) + case strings.Contains(vLow, low): + considerSnippet(scoreContentSubstr, "content:"+cf.Path, MatchSnippet{ + Token: tok, + Path: cf.Path, + Snippet: snippetForToken(cf.Value, tok), + }) + } + } if c.Kind != "" { kindLow := strings.ToLower(c.Kind) switch { @@ -138,9 +182,44 @@ func scoreToken(tok string, c candidate) (int, string, bool) { } if best == 0 { - return 0, "", false + return 0, "", nil, false + } + 
return best, bestSite, bestSnippet, true +} + +func snippetForToken(value, tok string) string { + runes := []rune(value) + if len(runes) <= maxSnippetRunes { + return value + } + valueLow := strings.ToLower(value) + tokLow := strings.ToLower(tok) + byteIdx := strings.Index(valueLow, tokLow) + if byteIdx < 0 { + return string(runes[:maxSnippetRunes]) + } + prefixRunes := len([]rune(value[:byteIdx])) + half := maxSnippetRunes / 2 + start := prefixRunes - half + if start < 0 { + start = 0 + } + end := start + maxSnippetRunes + if end > len(runes) { + end = len(runes) + start = end - maxSnippetRunes + if start < 0 { + start = 0 + } + } + snippet := string(runes[start:end]) + if start > 0 { + snippet = "..." + snippet + } + if end < len(runes) { + snippet += "..." } - return best, bestSite, true + return snippet } // kindMatches returns true if any of the kind filters refer to the candidate kind. diff --git a/internal/search/score_test.go b/internal/search/score_test.go index 000f1a3be..16813b523 100644 --- a/internal/search/score_test.go +++ b/internal/search/score_test.go @@ -1,6 +1,9 @@ package search -import "testing" +import ( + "strings" + "testing" +) func cand() candidate { return candidate{ @@ -15,7 +18,7 @@ func cand() candidate { func TestMatch_FreeTokenScoresHighestSite(t *testing.T) { q := Parse("redis") - score, _, ok := match(q, cand()) + score, _, _, ok := match(q, cand()) if !ok { t.Fatal("expected match") } @@ -27,7 +30,7 @@ func TestMatch_FreeTokenScoresHighestSite(t *testing.T) { func TestMatch_TwoTokensSummed(t *testing.T) { q := Parse("redis cache") - score, matched, ok := match(q, cand()) + score, matched, _, ok := match(q, cand()) if !ok { t.Fatal("expected match") } @@ -42,57 +45,57 @@ func TestMatch_TwoTokensSummed(t *testing.T) { func TestMatch_TokenMustMatchSomewhere(t *testing.T) { q := Parse("redis nope-not-here") - if _, _, ok := match(q, cand()); ok { + if _, _, _, ok := match(q, cand()); ok { t.Fatal("expected no match — second token 
must reject") } } func TestMatch_KindFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("kind:Service"), c); ok { + if _, _, _, ok := match(Parse("kind:Service"), c); ok { t.Fatal("kind:Service should reject a Pod candidate") } - if _, _, ok := match(Parse("kind:Pod"), c); !ok { + if _, _, _, ok := match(Parse("kind:Pod"), c); !ok { t.Fatal("kind:Pod should match a Pod candidate") } // Pluralized form too — radar fetch.go uses lowercase plural keys. - if _, _, ok := match(Parse("kind:pods"), c); !ok { + if _, _, _, ok := match(Parse("kind:pods"), c); !ok { t.Fatal("kind:pods should match") } } func TestMatch_NSFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("ns:dev"), c); ok { + if _, _, _, ok := match(Parse("ns:dev"), c); ok { t.Fatal("ns:dev should reject prod candidate") } - if _, _, ok := match(Parse("ns:prod"), c); !ok { + if _, _, _, ok := match(Parse("ns:prod"), c); !ok { t.Fatal("ns:prod should match") } } func TestMatch_LabelFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("label:app=redis"), c); !ok { + if _, _, _, ok := match(Parse("label:app=redis"), c); !ok { t.Fatal("label:app=redis should match") } - if _, _, ok := match(Parse("label:app=postgres"), c); ok { + if _, _, _, ok := match(Parse("label:app=postgres"), c); ok { t.Fatal("label:app=postgres should reject") } - if _, _, ok := match(Parse("label:app"), c); !ok { + if _, _, _, ok := match(Parse("label:app"), c); !ok { t.Fatal("label:app (key-only) should match when label exists") } - if _, _, ok := match(Parse("label:missing"), c); ok { + if _, _, _, ok := match(Parse("label:missing"), c); ok { t.Fatal("label:missing should reject when label absent") } } func TestMatch_ImageFilter(t *testing.T) { c := cand() - if _, _, ok := match(Parse("image:redis"), c); !ok { + if _, _, _, ok := match(Parse("image:redis"), c); !ok { t.Fatal("image:redis should match") } - if _, _, ok := match(Parse("image:nginx"), c); ok { + if _, _, _, ok := 
match(Parse("image:nginx"), c); ok { t.Fatal("image:nginx should reject") } } @@ -100,7 +103,7 @@ func TestMatch_ImageFilter(t *testing.T) { func TestMatch_PureFilterReturnsFlatScore(t *testing.T) { // Filter-only query (no free tokens) should return a positive flat // score so candidates show up at all. - score, _, ok := match(Parse("kind:Pod ns:prod"), cand()) + score, _, _, ok := match(Parse("kind:Pod ns:prod"), cand()) if !ok || score <= 0 { t.Fatalf("filter-only match: score=%d ok=%v", score, ok) } @@ -108,11 +111,32 @@ func TestMatch_PureFilterReturnsFlatScore(t *testing.T) { func TestMatch_CaseInsensitive(t *testing.T) { q := Parse("REDIS") - if _, _, ok := match(q, cand()); !ok { + if _, _, _, ok := match(q, cand()); !ok { t.Fatal("expected case-insensitive match") } } +func TestMatch_ContentSnippet(t *testing.T) { + c := cand() + c.Content = []ContentField{{ + Path: "data.flags.json", + Value: `{"adServiceFailure":{"defaultVariant":"on"}}`, + }} + score, matched, snippets, ok := match(Parse("adServiceFailure"), c) + if !ok { + t.Fatal("expected content match") + } + if score != scoreContentSubstr { + t.Fatalf("score=%d, expected content score %d", score, scoreContentSubstr) + } + if len(matched) != 1 || matched[0].Site != "content:data.flags.json" { + t.Fatalf("matched=%+v", matched) + } + if len(snippets) != 1 || snippets[0].Path != "data.flags.json" || !strings.Contains(snippets[0].Snippet, "adServiceFailure") { + t.Fatalf("snippets=%+v", snippets) + } +} + func TestKindMatches_Variants(t *testing.T) { cases := []struct { kind, filter string diff --git a/internal/search/search.go b/internal/search/search.go index 4d05f75c7..ccb536e0c 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -22,8 +22,23 @@ import ( "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" ) +// SummaryBuilderFunc, when supplied via Options.SummaryBuilder, is +// 
invoked once per matched hit to produce the compact SummaryContext +// attached to the hit's summaryContext field. Exactly one of obj/u will be +// non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil +// is fine (the field is omitempty); callers use it to gate context +// emission per request (context=none opts out by passing nil here). +// +// group is the candidate's API group (already known to the search +// walker — typed kinds via typedKinds, CRDs via gvr.Group). Threading +// it through lets the builder distinguish CRDs that share +// kind+namespace+name across groups (e.g. Knative Service vs corev1 +// Service) in its per-resource issue index. +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext + // Provider abstracts the cache so tests can inject a fake. type Provider interface { ListTyped(kind string, namespaces []string) ([]runtime.Object, error) @@ -106,9 +121,27 @@ type Options struct { // drop the candidate. Compile happens in the handler; this layer // just runs the program. Filter *CELFilter + // SummaryBuilder, when non-nil, is invoked per matched hit to + // attach the compact summaryContext (managedBy + health + + // issueCount). Handlers provide a closure that wraps the + // request-scoped topology + per-namespace issue index so the + // per-row cost stays flat. Pass nil to opt out (context=none) — + // the field is omitempty and consumers must tolerate its absence. + SummaryBuilder SummaryBuilderFunc } // Search runs the parsed query against the provider and returns ranked hits. +// pendingHit pairs a Hit with the source object that produced it, so the +// SummaryBuilder (topology lookups, issue-index reads) can be deferred +// until AFTER the hits are sorted and truncated to opts.Limit. Lifecycle is +// strictly internal to Search — never escapes the function. 
+type pendingHit struct { + hit Hit + obj runtime.Object // typed source (nil for CRD hits) + u *unstructured.Unstructured // unstructured source (nil for typed hits) + c candidate // for c.Group/Kind/Namespace/Name when invoking SummaryBuilder +} + func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, error) { if opts.Limit <= 0 { opts.Limit = DefaultLimit @@ -118,7 +151,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } var res Result - var hits []Hit + // Buffer hits along with the source object so summaryBuilder (topology + // lookups, issue-index reads) can run AFTER sort + truncate — without + // this, broad queries pay topology lookups for thousands of matches + // only to ship at most opts.Limit of them. + var pending []pendingHit // CEL filter eval errors are silently dropped per-row (the agent // just gets fewer hits, no 500), but we log the first error so an // operator can see when rows are dying to runtime issues — typical @@ -176,7 +213,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } c.Group = tk.Group - score, matched, ok := match(q, c) + score, matched, snippets, ok := match(q, c) if !ok { continue } @@ -204,7 +241,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, obj, nil)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, snippets, c, opts.Include, obj, nil), + obj: obj, + c: c, + }) } } @@ -244,7 +285,7 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err res.Searched += len(items) for _, u := range items { c := fromUnstructured(u, kind, gvr.Group) - score, matched, ok := match(q, c) + score, matched, snippets, ok := match(q, c) if !ok { continue } @@ -270,7 +311,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = 
append(hits, buildHit(score, matched, c, opts.Include, nil, u)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, snippets, c, opts.Include, nil, u), + u: u, + c: c, + }) } } @@ -282,21 +327,34 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } - sort.SliceStable(hits, func(i, j int) bool { - if hits[i].Score != hits[j].Score { - return hits[i].Score > hits[j].Score + sort.SliceStable(pending, func(i, j int) bool { + if pending[i].hit.Score != pending[j].hit.Score { + return pending[i].hit.Score > pending[j].hit.Score } - if hits[i].Kind != hits[j].Kind { - return hits[i].Kind < hits[j].Kind + if pending[i].hit.Kind != pending[j].hit.Kind { + return pending[i].hit.Kind < pending[j].hit.Kind } - if hits[i].Namespace != hits[j].Namespace { - return hits[i].Namespace < hits[j].Namespace + if pending[i].hit.Namespace != pending[j].hit.Namespace { + return pending[i].hit.Namespace < pending[j].hit.Namespace } - return hits[i].Name < hits[j].Name + return pending[i].hit.Name < pending[j].hit.Name }) - res.TotalMatched = len(hits) - if len(hits) > opts.Limit { - hits = hits[:opts.Limit] + res.TotalMatched = len(pending) + if len(pending) > opts.Limit { + pending = pending[:opts.Limit] + } + + // Summary attach happens HERE — after truncation — so the topology + // lookups + issue-index reads only run for the hits we'll actually + // ship. Skipped entirely when SummaryBuilder is nil (caller opted out + // via context=none). + hits := make([]Hit, len(pending)) + for i := range pending { + hits[i] = pending[i].hit + if opts.SummaryBuilder != nil { + c := pending[i].c + hits[i].SummaryContext = opts.SummaryBuilder(pending[i].obj, pending[i].u, c.Group, c.Kind, c.Namespace, c.Name) + } } res.Hits = hits res.Total = len(hits) @@ -345,8 +403,11 @@ func isClusterScopedKind(kind string) bool { // buildHit assembles the response shape for a matched candidate. Exactly // one of obj/u will be non-nil. 
minify-on-demand keeps the cost of -// IncludeNone (identity-only) flat. -func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { +// IncludeNone (identity-only) flat. SummaryContext attachment is NOT +// done here — it happens in Search's post-truncation loop so the +// expensive topology lookups + issue-index reads only run for the hits +// that survive sort + Limit truncation. +func buildHit(score int, matched []MatchedField, snippets []MatchSnippet, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { h := Hit{ Score: score, Kind: c.Kind, @@ -354,6 +415,7 @@ func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, Namespace: c.Namespace, Name: c.Name, Matched: matched, + Snippets: snippets, } switch mode { case IncludeSummary: diff --git a/internal/search/search_test.go b/internal/search/search_test.go index 91f8ed0dc..4b97e51cf 100644 --- a/internal/search/search_test.go +++ b/internal/search/search_test.go @@ -118,6 +118,69 @@ func TestSearch_ImageMatch(t *testing.T) { } } +func TestSearch_ConfigMapDataMatchWithSnippet(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "configmaps": { + &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: "flagd-config", Namespace: "astronomy-shop"}, + Data: map[string]string{ + "flags.json": `{"adServiceFailure":{"defaultVariant":"on"}}`, + }, + }, + }, + }, + } + res, err := Search(context.Background(), p, Parse("adServiceFailure"), Options{Include: IncludeNone}) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected configmap content hit, got %+v", res.Hits) + } + h := res.Hits[0] + if h.Kind != "ConfigMap" || h.Name != "flagd-config" { + t.Fatalf("wrong hit: %+v", h) + } + if len(h.Snippets) != 1 || h.Snippets[0].Path != "data.flags.json" { + t.Fatalf("expected data snippet, got %+v", h.Snippets) + } +} + +func 
TestSearch_DynamicSpecMatchWithSnippet(t *testing.T) { + gvr := schema.GroupVersionResource{Group: "chaos-mesh.org", Version: "v1alpha1", Resource: "networkchaos"} + u := &unstructured.Unstructured{Object: map[string]any{ + "apiVersion": "chaos-mesh.org/v1alpha1", + "kind": "NetworkChaos", + "metadata": map[string]any{ + "name": "net-fault", + "namespace": "hotel", + }, + "spec": map[string]any{ + "selector": map[string]any{ + "labelSelectors": map[string]any{ + "app": "user", + }, + }, + "action": "delay", + }, + }} + p := &fakeProvider{ + dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{gvr: {u}}, + kinds: map[schema.GroupVersionResource]string{gvr: "NetworkChaos"}, + } + res, err := Search(context.Background(), p, Parse("delay"), Options{Include: IncludeNone}) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected CRD content hit, got %+v", res.Hits) + } + if len(res.Hits[0].Snippets) != 1 || res.Hits[0].Snippets[0].Path != "spec.action" { + t.Fatalf("expected spec snippet, got %+v", res.Hits[0].Snippets) + } +} + func TestSearch_LimitTruncates(t *testing.T) { pods := make([]runtime.Object, 0, 100) for i := 0; i < 100; i++ { diff --git a/internal/search/summary_context_test.go b/internal/search/summary_context_test.go new file mode 100644 index 000000000..0f8b3228e --- /dev/null +++ b/internal/search/summary_context_test.go @@ -0,0 +1,92 @@ +package search + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// TestSearch_SummaryBuilderAttached pins the wiring: when Options.SummaryBuilder +// is non-nil, the executor invokes it per kept hit and the result lands +// in Hit.SummaryContext. 
+func TestSearch_SummaryBuilderAttached(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, + }, + }, + }, + }, + } + + var calls int + var gotGroup string + builder := func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + calls++ + gotGroup = group + return &resourcecontext.ResourceSummaryContext{ + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: namespace}, + Health: "healthy", + IssueCount: 0, + } + } + + res, _ := Search(context.Background(), p, Parse("api-1"), Options{ + Include: IncludeNone, + SummaryBuilder: builder, + }) + if calls != 1 { + t.Fatalf("SummaryBuilder calls = %d, want 1", calls) + } + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + h := res.Hits[0] + if h.SummaryContext == nil { + t.Fatalf("SummaryContext not attached to hit: %+v", h) + } + if h.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", h.SummaryContext.Health) + } + if h.SummaryContext.ManagedBy == nil || h.SummaryContext.ManagedBy.Name != "api" { + t.Errorf("ManagedBy mismatch: %+v", h.SummaryContext.ManagedBy) + } + // Pod is core-group — builder should see "" for group, threaded + // through from candidate.Group (set on the typed walker via tk.Group). + if gotGroup != "" { + t.Errorf("builder saw group=%q for core-group Pod, want \"\"", gotGroup) + } +} + +// TestSearch_NoSummaryBuilder_LeavesNilContext is the opt-out path +// (context=none in the handler maps to nil SummaryBuilder here). Hits +// must have no SummaryContext. 
+func TestSearch_NoSummaryBuilder_LeavesNilContext(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + }, + }, + }, + } + res, _ := Search(context.Background(), p, Parse("api-1"), Options{Include: IncludeNone}) + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + if res.Hits[0].SummaryContext != nil { + t.Errorf("expected nil SummaryContext when SummaryBuilder unset, got %+v", res.Hits[0].SummaryContext) + } +} diff --git a/internal/search/types.go b/internal/search/types.go index 83551ba86..b1d650fc5 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -1,6 +1,9 @@ package search -import "github.com/skyhook-io/radar/internal/filter" +import ( + "github.com/skyhook-io/radar/internal/filter" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) const ( DefaultLimit = 50 @@ -46,15 +49,29 @@ type Hit struct { Summary any `json:"summary,omitempty"` Raw any `json:"raw,omitempty"` Matched []MatchedField `json:"matched,omitempty"` + Snippets []MatchSnippet `json:"snippets,omitempty"` + // SummaryContext is the compact per-row enrichment (managedBy, health, + // issueCount). Populated by handlers via Options.SummaryBuilder; nil + // when the caller opted out (context=none) or no fields apply. + SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // MatchedField records where a query token landed (debug + UI highlight). type MatchedField struct { Token string `json:"token"` - Site string `json:"site"` // "name" | "namespace" | "label:k" | "annotation:k" | "image" | "kind" + Site string `json:"site"` // "name" | "namespace" | "label:k" | "annotation:k" | "image" | "kind" | "content:path" Score int `json:"score"` } +// MatchSnippet is a short excerpt from a content field that matched a free +// token. 
It lets agents use search as a cheap grep-like first pass without +// fetching the full resource body for every hit. +type MatchSnippet struct { + Token string `json:"token"` + Path string `json:"path"` + Snippet string `json:"snippet"` +} + // Result is the full response shape for a search request. type Result struct { Hits []Hit `json:"hits"` @@ -86,4 +103,3 @@ const ( IncludeRaw IncludeNone // identity only (cheapest) ) - diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 6ea130bac..68511d576 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -1,17 +1,99 @@ +// /api/ai/* is the REST mirror of the MCP agent surface. Both target AI +// consumers (Claude, scripted agents) rather than the SPA, and both +// intentionally evolve at agent-iteration speed. +// +// Unlike /api/* (consumed by the SPA via a generated TypeScript client), +// the /api/ai/* surface is NOT specified in openapi.yaml. The original +// motivation for OpenAPI-first in radar was frontend/backend type safety — +// one spec, regenerated as Go server stubs + TS client. That value +// proposition does not apply here: the agent consumer doesn't read +// OpenAPI specs (it reads MCP tool descriptions or in-prompt instructions), +// and the SPA doesn't call these endpoints at all. +// +// Wire shapes for the agent surface live in pkg/resourcecontext (typed +// JSON DTOs) and pkg/topology. MCP tools document their wire via +// jsonschema struct tags. /api/ai/* follows the same code-defined +// discipline as MCP, treating them as one logical surface served over +// two protocols. +// +// Revisit this opt-out when: +// +// (a) the agent surface stabilizes (no major shape changes for two +// release cycles), AND +// (b) Skyhook commits to a public customer-facing AI SDK that needs +// generated bindings. 
+// +// Until both conditions are met, bringing /api/ai/* under openapi.yaml +// is premature — it would pay the spec-authoring tax during evolution +// without earning the SDK-generation benefit. package server import ( + "context" "fmt" "net/http" + "sort" "strings" "github.com/go-chi/chi/v5" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" - aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/audit" + "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/internal/summarycontext" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + bpaudit "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" ) +// policyReportLookupAdapter wraps internal/k8s.GetPolicyReportIndex() into +// the resourcecontext.PolicyReportLookup interface, translating the +// richer pkg/policyreports.Finding shape (which carries Severity + +// Category) into the agent-facing resourcecontext.KyvernoFinding shape +// (Policy / Rule / Result / Message only). Keeping the projection narrow +// here lets unrelated changes to policyreports.Finding evolve without +// perturbing the wire contract that downstream callers depend on. 
+type policyReportLookupAdapter struct { + idx *policyreports.Index +} + +func (a policyReportLookupAdapter) FindingsFor(group, kind, namespace, name string) []resourcecontext.KyvernoFinding { + if a.idx == nil { + return nil + } + findings := a.idx.FindingsFor(group, kind, namespace, name) + if len(findings) == 0 { + return nil + } + out := make([]resourcecontext.KyvernoFinding, len(findings)) + for i, f := range findings { + out[i] = resourcecontext.KyvernoFinding{ + Policy: f.Policy, + Rule: f.Rule, + Result: f.Result, + Message: f.Message, + } + } + return out +} + +type serviceBackendLookup struct { + cache *k8s.ResourceCache +} + +func (l serviceBackendLookup) PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) { + if l.cache == nil || l.cache.Pods() == nil { + return nil, nil + } + return l.cache.Pods().Pods(namespace).List(selector) +} + // parseVerbosity reads the ?verbosity= query parameter and returns the matching level. func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aicontext.VerbosityLevel { switch r.URL.Query().Get("verbosity") { @@ -27,19 +109,33 @@ func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aico } // handleAIListResources returns a minified list of resources for AI consumption. -// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact +// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact&context=none +// +// summaryContext (managedBy + health + issueCount) is attached per row +// at Summary verbosity by default. Pass ?context=none to opt out for a +// bare list. 
func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return } - kind := chi.URLParam(r, "kind") + kind := normalizeKind(chi.URLParam(r, "kind")) + group := r.URL.Query().Get("group") + level := parseVerbosity(r, aicontext.LevelSummary) + skipContext := r.URL.Query().Get("context") == "none" + + // parseNamespacesForUser primes the per-user perm cache. preflightResourceList + // then enforces the same RBAC gates as the REST list path (cluster-scoped + // SAR for cluster-only kinds, list-namespaces SAR for `kind=namespaces`, + // per-namespace and/or cluster-wide list-secrets SAR for `kind=secrets`). + // AI callers get an explicit 403 on deny instead of the empty-list shape + // the REST handler returns for backward compat. namespaces := s.parseNamespacesForUser(r) - if noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) + finalNamespaces, status, msg, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeError(w, status, msg) return } - group := r.URL.Query().Get("group") - level := parseVerbosity(r, aicontext.LevelSummary) + namespaces = finalNamespaces cache := k8s.GetResourceCache() if cache == nil { @@ -47,11 +143,23 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. Knative + // serving.knative.dev/Service vs corev1 ""/Service, KEDA's HPA-like + // kinds) reach the right resource. FetchResourceList is group-blind + // — it would silently return the core typed list, dropping the + // query's group filter on the floor. Mirrors the same group-aware + // short-circuit in handleGetResource (PR #721). + if group != "" { + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) + return + } + + // Try typed cache first (group=="" → core/built-in lookup). 
objs, err := k8s.FetchResourceList(cache, kind, namespaces) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs - s.aiListDynamic(w, r, cache, kind, namespaces, group, level) + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) return } if err != nil { @@ -69,11 +177,43 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } + // Attach summaryContext per row at Summary verbosity. Compact/Detail + // already carry richer context on the get-resource path; the + // per-row attachment is specifically for cheap list triage. + // + // For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) issues + // live at namespace="" — scoping the issue index to the user's + // namespace set would silently zero issueCount on every row. The + // preflight RBAC above has already authorized cluster-scoped reads, + // so we pass nil here to compose cluster-wide. + if !skipContext && level == aicontext.LevelSummary { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + // Typed list resolves group from each object's TypeMeta — + // MinifyList sets it via SetTypeMeta before producing rows, + // so we can trust apiVersion on the typed source. + summarycontext.AttachToTypedList(results, objs, builder) + } + } + s.writeJSON(w, results) } +// issueIndexNamespaces returns the namespace slice to scope the issue +// index by. For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) +// returns nil so cluster-scoped issues (which live at namespace="") are +// not filtered out by the user's namespace-restricted access set. +// Namespaced kinds pass through unchanged. 
+func issueIndexNamespaces(namespaces []string, kind, group string) []string { + clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group) + if clusterScoped { + return nil + } + return namespaces +} + // aiListDynamic handles the CRD/dynamic fallback for AI list. -func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel) { +func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel, skipContext bool) { var allItems []*unstructured.Unstructured if len(namespaces) > 0 { @@ -107,11 +247,33 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 results = append(results, aicontext.MinifyUnstructured(item, level)) } + if !skipContext && level == aicontext.LevelSummary { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToUnstructuredList(results, allItems, builder) + } + } + s.writeJSON(w, results) } -// handleAIGetResource returns a single minified resource for AI consumption. -// GET /api/ai/resources/{kind}/{namespace}/{name}?group=X&verbosity=summary|detail|compact +// handleAIGetResource returns a single minified resource for AI consumption, +// wrapped with a resourceContext enrichment block by default. +// +// GET /api/ai/resources/{kind}/{namespace}/{name} +// +// Query params: +// - group=X API group disambiguator for CRDs. +// - verbosity=... summary | detail | compact (default: detail). +// - context=none Skip resourceContext build, return bare minified resource. +// +// Response shape (default): +// +// { "resource": , "resourceContext": { ...basic tier... 
} } +// +// Response shape (context=none): +// +// func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return @@ -121,53 +283,371 @@ func (s *Server) handleAIGetResource(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") group := r.URL.Query().Get("group") level := parseVerbosity(r, aicontext.LevelDetail) + skipContext := r.URL.Query().Get("context") == "none" // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace if namespace == "_" { namespace = "" } + // Run the same RBAC preflight as handleGetResource — the AI endpoint + // returns the same resource bytes (just minified) and must gate on the + // same per-user SAR / namespace-access tuple. Without this, a user with + // no `get secrets` SAR could read Secret values via /api/ai/resources/… + // even though /api/resources/… correctly returns 403. Runs BEFORE the + // fetch so cluster-scoped denies don't leak existence by status code. 
+ if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") return } - // Try typed cache first - obj, err := k8s.FetchResource(cache, kind, namespace, name) - if err == k8s.ErrUnknownKind { - // Fall through to dynamic cache for CRDs - u, dynErr := cache.GetDynamicWithGroup(r.Context(), kind, namespace, name, group) - if dynErr != nil { - if strings.Contains(dynErr.Error(), "unknown resource kind") { - s.writeError(w, http.StatusBadRequest, dynErr.Error()) - return - } - if strings.Contains(dynErr.Error(), "not found") { - s.writeError(w, http.StatusNotFound, dynErr.Error()) - return - } - s.writeError(w, http.StatusInternalServerError, dynErr.Error()) - return - } - s.writeJSON(w, aicontext.MinifyUnstructured(u, level)) - return - } + obj, isUnstructured, err := s.fetchAIResource(r.Context(), cache, kind, namespace, name, group) if err != nil { - if strings.HasPrefix(err.Error(), "forbidden:") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) - return - } - s.writeError(w, http.StatusNotFound, err.Error()) + s.writeAIFetchError(w, kind, err) return } - k8s.SetTypeMeta(obj) - result, err := aicontext.Minify(obj, level) + if !isUnstructured { + k8s.SetTypeMeta(obj) + } + + minified, err := minifyForAI(obj, isUnstructured, level) if err != nil { s.writeError(w, http.StatusInternalServerError, err.Error()) return } - s.writeJSON(w, result) + if skipContext { + s.writeJSON(w, minified) + return + } + + rc := s.buildAIResourceContext(r, obj, kind, namespace, name) + s.writeJSON(w, map[string]any{ + "resource": minified, + "resourceContext": rc, + }) +} + +// fetchAIResource resolves the resource from the typed cache or dynamic cache. +// The bool reports whether the returned object is an unstructured (CRD) value. 
+// +// When a group is provided, the typed cache is skipped entirely and the +// dynamic cache is consulted with the group qualifier. This prevents kind +// collisions where a CRD plural shadows a core kind (e.g., Knative +// serving.knative.dev/Service vs core/v1 Service): without this branch, +// FetchResource("services", ...) would return the core Service from the +// typed informer and the requested group would never be consulted, leaking +// the wrong object via the AI surface. Mirrors handleGetResource's +// group-first dispatch in server.go. +func (s *Server) fetchAIResource(ctx context.Context, cache *k8s.ResourceCache, kind, namespace, name, group string) (runtime.Object, bool, error) { + if group != "" { + u, err := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if err != nil { + return nil, false, err + } + return u, true, nil + } + obj, err := k8s.FetchResource(cache, kind, namespace, name) + if err == nil { + return obj, false, nil + } + if err != k8s.ErrUnknownKind { + return nil, false, err + } + u, dynErr := cache.GetDynamicWithGroup(ctx, kind, namespace, name, group) + if dynErr != nil { + return nil, false, dynErr + } + return u, true, nil +} + +// writeAIFetchError maps fetch errors to HTTP status codes. Mirrors the +// previous inline behavior so consumers don't see a status-code drift. +func (s *Server) writeAIFetchError(w http.ResponseWriter, kind string, err error) { + msg := err.Error() + switch { + case strings.HasPrefix(msg, "forbidden:"): + s.writeError(w, http.StatusForbidden, fmt.Sprintf("insufficient permissions to access %s", kind)) + case strings.Contains(msg, "unknown resource kind"): + s.writeError(w, http.StatusBadRequest, msg) + case strings.Contains(msg, "not found"): + s.writeError(w, http.StatusNotFound, msg) + default: + // Unknown errors are server-side problems (e.g. 
"resource discovery + // not initialized", "dynamic resource cache not initialized") — surface + // as 500 so debugging upstream issues isn't masked by a misleading 404. + s.writeError(w, http.StatusInternalServerError, msg) + } +} + +// minifyForAI dispatches to the right Minify variant based on whether the +// resource is unstructured (CRD) or typed. +func minifyForAI(obj runtime.Object, isUnstructured bool, level aicontext.VerbosityLevel) (any, error) { + if isUnstructured { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, fmt.Errorf("internal: object marked unstructured but is %T", obj) + } + return aicontext.MinifyUnstructured(u, level), nil + } + return aicontext.Minify(obj, level) +} + +// buildAIResourceContext assembles the Options struct and calls Build. +// Returns the populated context — never nil unless obj is nil. +func (s *Server) buildAIResourceContext(r *http.Request, obj runtime.Object, kind, namespace, name string) *resourcecontext.ResourceContext { + if obj == nil { + return nil + } + cache := k8s.GetResourceCache() + + // Canonical kind from the resource's own TypeMeta (set at fetch). Pascal + // singular — matches what the audit check runner writes into Finding.Kind, + // so the audit index lookup keys correctly. Falls back to the URL kind + // only when TypeMeta is somehow empty; non-canonical input there would + // silently mis-key the audit lookup. 
+ gvk := obj.GetObjectKind().GroupVersionKind() + canonicalKind := gvk.Kind + if canonicalKind == "" { + canonicalKind = kind + } + canonicalGroup := gvk.Group + + issueSum := computeIssueSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) + auditSum := computeAuditSummaryForResource(cache, canonicalGroup, canonicalKind, namespace, name) + + opts := resourcecontext.Options{ + Tier: resourcecontext.TierBasic, + AccessChecker: s.newRequestScopedChecker(r), + IssueSummary: issueSum, + AuditSummary: auditSum, + ServiceBackends: serviceBackendLookup{cache: cache}, + } + + // Wire the PolicyReport index when Kyverno is installed. Build emits a + // counts-only `policySummary.kyverno` on the basic tier; diagnostic + // tier (T10) will surface the top[] findings. + if idx := k8s.GetPolicyReportIndex(); idx != nil { + opts.PolicyReports = policyReportLookupAdapter{idx: idx} + } + + if topo, prov, dyn, ok := s.topologyForContext(namespace); ok { + opts.Topology = topo + opts.Provider = prov + opts.DynamicProv = dyn + // Relationships are computed inside Build via GetRelationshipsWithObject, + // which applies the same KindForGVK pseudo-kind remap we used to do + // here. Pre-computing in the handler doubled the work whenever the + // lookup returned nil (no edges): handler call returned nil, Build's + // `rel == nil && opts.Topology != nil` fallback re-ran the identical + // scan. Leaving opts.Relationships unset is the canonical path. + } + + return resourcecontext.Build(r.Context(), obj, opts) +} + +// topologyForContext builds (or fetches the memoized) topology scoped to the +// resource's namespace. Cluster-scoped resources get an all-namespaces build. +// Returns ok=false when the cache isn't ready yet. 
+func (s *Server) topologyForContext(namespace string) (*topology.Topology, topology.ResourceProvider, topology.DynamicProvider, bool) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, nil, nil, false + } + opts := topology.DefaultBuildOptions() + if namespace != "" { + opts.Namespaces = []string{namespace} + } + opts.IncludeReplicaSets = true + opts.ForRelationshipCache = true + + provider := k8s.NewTopologyResourceProvider(cache) + dyn := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + topo, err := s.topoMemo.Get(opts, func() (*topology.Topology, error) { + return topology.NewBuilder(provider).WithDynamic(dyn).Build(opts) + }) + if err != nil || topo == nil { + return nil, nil, nil, false + } + return topo, provider, dyn, true +} + +// computeIssueSummaryForResource rolls up per-resource issue-composer rows +// (problem + condition + optional audit) into an IssueSummary. +// +// The composer is the canonical "what's wrong with this resource" surface — +// it merges problem detection (Deployment/DS/etc.), pod-level conditions, +// and generic CRD condition fallback. Filtering to a single (kind, name) +// is done client-side; the composer's native namespace filter restricts the +// scan to the resource's namespace so we don't walk the whole cluster. +// +// kind MUST be the Pascal singular form the issue composer writes into +// Issue.Kind (e.g. "Deployment", "Pod") — the caller derives it from obj's +// TypeMeta. The composer's Filters.Kinds matcher case-folds both sides, but +// it does NOT plural-to-singular convert, so URL forms ("deployments", +// "pods") drop every issue ("deployments" != lower("Deployment")) and the +// summary silently collapses to nil. +// +// Returns nil when no issues match — Build then omits the IssueSummary field. 
+func computeIssueSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.IssueSummary { + if cache == nil { + return nil + } + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + filters := issues.Filters{ + Kinds: []string{kind}, + Limit: issues.MaxLimit, + } + if namespace != "" { + filters.Namespaces = []string{namespace} + } + rows, _ := issues.ComposeWithStats(provider, filters) + + matched := make([]issues.Issue, 0, len(rows)) + bySource := make(map[string]int) + for _, row := range rows { + if row.Name != name { + continue + } + if namespace != "" && row.Namespace != namespace { + continue + } + // Group-aware match: T11 populates Issue.Group for problem + + // condition sources, so a Knative serving.knative.dev/Service + // lookup won't pull in the core Service's issues (or vice + // versa). The fromAudit / fromEvent sources emit Group="" today + // — those correctly match only the core-group lookup, which is + // the existing behavior. + if row.Group != group { + continue + } + matched = append(matched, row) + bySource[string(row.Source)]++ + } + if len(matched) == 0 { + return nil + } + // Sort by (severity desc, Reason asc) so TopReason is deterministic + // across runs even when multiple rows tie on severity. Mirrors the + // stable sort applied in computeAuditSummaryForResource. + sort.Slice(matched, func(i, j int) bool { + ri, rj := composeSeverityRank(matched[i].Severity), composeSeverityRank(matched[j].Severity) + if ri != rj { + return ri > rj + } + return matched[i].Reason < matched[j].Reason + }) + count := len(matched) + topSeverity := matched[0].Severity + topReason := matched[0].Reason + return &resourcecontext.IssueSummary{ + Count: count, + HighestSeverity: string(topSeverity), + TopReason: topReason, + BySource: bySource, + } +} + +// composeSeverityRank orders issues.Severity for highest-wins rollup. 
+func composeSeverityRank(s issues.Severity) int { + switch s { + case issues.SeverityCritical: + return 2 + case issues.SeverityWarning: + return 1 + } + return 0 +} + +// computeAuditSummaryForResource looks up audit findings for the subject +// resource via the canonical (Kind/ns/name) tuple. kind MUST be the Pascal +// singular form the audit check runner writes into Finding.Kind (e.g. "Pod", +// not "pod" or "pods") — the caller derives it from obj's TypeMeta. Without +// a Kind-aware key, a Deployment "web" in "prod" would inherit findings +// from a Service "web" in the same namespace, since map iteration in the +// previous implementation only compared (namespace, name). +// +// TopFinding is selected deterministically: highest severity wins, with +// CheckID as the ascending tiebreaker. Map iteration ordering does NOT +// influence the choice — agents pinning regression tests on +// resourceContext output rely on stable field values across runs. +func computeAuditSummaryForResource(cache *k8s.ResourceCache, group, kind, namespace, name string) *resourcecontext.AuditSummary { + if cache == nil || kind == "" { + return nil + } + // Match computeIssueSummaryForResource's guard: passing []string{""} to + // RunFromCache would filter to literally namespace="" resources instead + // of scanning all namespaces. Latent today since the audit suite + // doesn't cover cluster-scoped kinds, but the inconsistency would + // silently miss findings the moment a cluster-scoped check lands. 
+ var namespaces []string + if namespace != "" { + namespaces = []string{namespace} + } + results := audit.RunFromCache(cache, namespaces, nil) + if results == nil || len(results.Findings) == 0 { + return nil + } + idx := bpaudit.IndexByResource(results.Findings) + match := idx[bpaudit.ResourceKey(group, kind, namespace, name)] + if len(match) == 0 { + return nil + } + + // Sort by (severity desc, CheckID asc) so TopFinding is deterministic + // across runs even when multiple findings tie on severity. + sort.Slice(match, func(i, j int) bool { + ri, rj := auditSeverityRank(match[i].Severity), auditSeverityRank(match[j].Severity) + if ri != rj { + return ri > rj + } + return match[i].CheckID < match[j].CheckID + }) + topFinding := match[0].CheckID + return &resourcecontext.AuditSummary{ + Count: len(match), + HighestSeverity: normalizeAuditSeverity(match[0].Severity), + TopFinding: topFinding, + } +} + +// normalizeAuditSeverity maps the audit suite's emission vocabulary +// ("danger" / "warning") onto the unified resourceContext severity +// scale ("critical" / "warning") used by issueSummary. Two sibling +// fields in the same response reporting severity in different +// vocabularies — "danger" vs "critical" — is a wire-shape footgun for +// consumers. Mirrors the same mapping internal/issues.fromAudit +// applies when audit findings flow through the unified issue stream. +// Empty / unknown severities pass through unchanged so the contract +// stays explicit if the audit suite ever grows new values. +func normalizeAuditSeverity(s string) string { + switch s { + case bpaudit.SeverityDanger: + return string(issues.SeverityCritical) + case bpaudit.SeverityWarning: + return string(issues.SeverityWarning) + } + return s +} + +// auditSeverityRank orders audit finding severities ("danger" > "warning"). 
+func auditSeverityRank(s string) int { + switch s { + case bpaudit.SeverityDanger: + return 2 + case bpaudit.SeverityWarning: + return 1 + } + return 0 } diff --git a/internal/server/ai_handlers_group_test.go b/internal/server/ai_handlers_group_test.go new file mode 100644 index 000000000..b6e0aabbe --- /dev/null +++ b/internal/server/ai_handlers_group_test.go @@ -0,0 +1,332 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + dynamicfake "k8s.io/client-go/dynamic/fake" + + "github.com/skyhook-io/radar/internal/k8s" +) + +// Group-qualified AI GET must route to the dynamic cache so CRDs whose +// plural shadows a core kind (Knative serving.knative.dev/Service vs +// core/v1 Service) resolve to the requested object — not whichever the +// typed cache happens to hold under that kind/name pair. +// +// Without the group-first branch in fetchAIResource, FetchResource( +// "services", ...) returns the core/v1 Service from the typed informer +// and ?group=serving.knative.dev is silently dropped. The bug surfaces +// as wrong-object disclosure on the AI surface: a caller asking for the +// Knative Service receives the core Service's spec + IP + selector +// instead. This pins the fix and would regress if the typed cache is +// consulted before the group qualifier. +// +// Same bug class as T12's group-blind root lookup, but on the single- +// resource GET path; ResourceContext relationship walks already disambig +// by group (see pkg/topology/managedby_test.go), so a regression here is +// the last remaining hot spot for kind/plural collisions on the GET API. +func TestAIGetResource_GroupRoutesToDynamic(t *testing.T) { + // Seed a Knative Service named "nginx" in "default" — same name+ns as + // the core Service registered in TestMain. 
Without ?group routing, the + // typed cache wins and returns the core Service. With it, the dynamic + // cache returns the Knative Service. + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + // Warm the informer so the Get() call below sees the seeded object + // without racing on initial sync. 
+ dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev&context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + // context=none returns the minified resource directly (no envelope). + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "serving.knative.dev/v1" { + t.Fatalf("apiVersion = %q, want serving.knative.dev/v1 — group qualifier was ignored "+ + "and the typed cache's core Service was returned instead", apiVersion) + } + kind, _ := body["kind"].(string) + if kind != "Service" { + t.Errorf("kind = %q, want Service", kind) + } + // Cross-check: the core Service has a Spec.Selector / ClusterIP shape + // that the Knative seed does NOT have. A regression that returned the + // core Service would carry those fields here. + spec, _ := body["spec"].(map[string]any) + if _, hasSelector := spec["selector"]; hasSelector { + t.Errorf("response carries Service.spec.selector — looks like the core Service leaked through "+ + "despite ?group=serving.knative.dev; body=%+v", body) + } +} + +// Group-qualified AI GET must also route the topology relationship lookup +// to the matching pseudo-kind node. 
The bug: handleAIGetResource passed the +// URL plural "services" straight into topology.GetRelationshipsWithObject, +// which feeds buildNodeID — and buildNodeID's kindMap resolves "services" +// to "service", landing on the CORE Service's topology node. For a Knative +// Service request, the response then carried the core Service's incoming +// Ingress edge as resourceContext.exposes, which is provably wrong. +// +// Fix: derive a topology-pseudo-kind via topology.KindForGVK(gvk.Kind, +// gvk.Group) — for Knative Service, that yields "knativeservice", whose +// node has no Ingress edge in this fixture and therefore no Exposes. +// +// Differentiator: the TestMain fixture seeds an Ingress backend-ref'd to +// the core Service "nginx" in "default". The Knative Service "nginx" in +// "default" (seeded below into the dynamic cache) is a separate topology +// node with NO incoming Ingress edges. The test asserts that the +// resourceContext returned for the ?group=serving.knative.dev request +// does NOT advertise that Ingress — the same fixture, when queried +// without ?group, DOES surface it (locked down by the trailing sub-test +// to pin the regression's pre-fix shape and prevent a future change that +// silently drops the core-side relationship as well). 
+func TestAIGetResource_GroupRoutesRelationshipsToKnative(t *testing.T) { + knativeGVR := schema.GroupVersionResource{Group: "serving.knative.dev", Version: "v1", Resource: "services"} + knativeSvc := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": map[string]any{ + "name": "nginx", + "namespace": "default", + }, + "spec": map[string]any{ + "template": map[string]any{ + "spec": map[string]any{ + "containers": []any{ + map[string]any{"image": "gcr.io/example/hello:1"}, + }, + }, + }, + }, + }, + } + dyn := dynamicfake.NewSimpleDynamicClientWithCustomListKinds( + runtime.NewScheme(), + map[schema.GroupVersionResource]string{knativeGVR: "ServiceList"}, + knativeSvc, + ) + resources := []k8s.APIResource{ + { + Group: "serving.knative.dev", + Version: "v1", + Kind: "Service", + Name: "services", + Namespaced: true, + IsCRD: true, + Verbs: []string{"get", "list", "watch"}, + }, + } + if err := k8s.InitTestDynamicResourceCache(dyn, resources); err != nil { + t.Fatalf("InitTestDynamicResourceCache: %v", err) + } + t.Cleanup(k8s.ResetTestDynamicState) + + dynCache := k8s.GetDynamicResourceCache() + if dynCache == nil { + t.Fatal("dynamic cache not initialized") + } + if err := dynCache.EnsureWatching(knativeGVR); err != nil { + t.Fatalf("EnsureWatching: %v", err) + } + if !dynCache.WaitForSync(knativeGVR, 5*time.Second) { + t.Fatal("timed out waiting for Knative Service informer sync") + } + + // The Knative Service request MUST NOT inherit the core Service's + // Ingress in resourceContext.exposes. Pre-fix, the URL "services" was + // passed into buildNodeID and resolved to "service/default/nginx" — + // the wrong topology node — so the Ingress leaked. 
+ resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?group=serving.knative.dev") + if err != nil { + t.Fatalf("GET (knative): %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var knBody map[string]any + if err := json.NewDecoder(resp.Body).Decode(&knBody); err != nil { + t.Fatalf("decode (knative): %v", err) + } + knRC, _ := knBody["resourceContext"].(map[string]any) + if knRC == nil { + t.Fatal("knative response missing resourceContext") + } + exposes, _ := knRC["exposes"].([]any) + for _, e := range exposes { + em, _ := e.(map[string]any) + kind, _ := em["kind"].(string) + name, _ := em["name"].(string) + if kind == "Ingress" && name == "nginx-ingress" { + t.Fatalf("knative-routed request leaked the core Service's Ingress into resourceContext.exposes "+ + "(got %+v) — relationship lookup did NOT remap to the knativeservice topology node; "+ + "check that handleAIGetResource is funneling kind through topology.KindForGVK", exposes) + } + } + + // Co-anchored sibling: when no ?group is passed, the same path resolves + // to the core Service node and MUST still surface the Ingress. This + // half guards against an over-correction that nukes the relationship + // lookup for the dominant typed-cache case while fixing the CRD case. 
+ respCore, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx") + if err != nil { + t.Fatalf("GET (core): %v", err) + } + defer respCore.Body.Close() + var coreBody map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreBody); err != nil { + t.Fatalf("decode (core): %v", err) + } + coreRC, _ := coreBody["resourceContext"].(map[string]any) + coreExposes, _ := coreRC["exposes"].([]any) + foundIngress := false + for _, e := range coreExposes { + em, _ := e.(map[string]any) + if em["kind"] == "Ingress" && em["name"] == "nginx-ingress" { + foundIngress = true + break + } + } + if !foundIngress { + t.Errorf("core Service request lost the Ingress from resourceContext.exposes (got %+v) — "+ + "the fix overshot and broke the typed-cache relationship lookup", coreExposes) + } +} + +// Pin Finding 1: the AI GET handler used to pass the URL-plural kind +// ("deployments") into computeIssueSummaryForResource, which forwards +// it to issues.Compose via Filters.Kinds. The composer's applyFilters +// case-folds both sides (strings.ToLower) but does NOT plural-to-singular +// convert — and Issue.Kind is the canonical Pascal singular ("Deployment"). +// So the filter set {"deployments"} never matched lower("Deployment") = +// "deployment", every issue got dropped, and IssueSummary.Count silently +// collapsed to 0 (Build then omits the field entirely). +// +// Fix: pass canonicalKind (derived from obj.GVK) into +// computeIssueSummaryForResource so the filter is "Deployment" → matched. +// +// Fixture: TestMain seeds Deployment broken/stuck-app with +// UnavailableReplicas=3. DetectProblems emits a Pascal-singular +// "Deployment" problem for it. Hitting /api/ai/resources/deployments/... +// (URL plural) must surface the issue in resourceContext.issueSummary +// with count > 0 — pre-fix this came back as null. 
+func TestAIGetResource_IssueSummaryCountsURLPluralKind(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/deployments/broken/stuck-app") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode: %v", err) + } + rc, _ := body["resourceContext"].(map[string]any) + if rc == nil { + t.Fatal("response missing resourceContext") + } + issueSum, _ := rc["issueSummary"].(map[string]any) + if issueSum == nil { + t.Fatalf("resourceContext.issueSummary is nil — composer filter dropped every issue. "+ + "Likely the handler is still passing URL-plural kind ('deployments') into "+ + "computeIssueSummaryForResource instead of canonical Pascal singular ('Deployment'). "+ + "Got: %+v", rc) + } + count, _ := issueSum["count"].(float64) + if count < 1 { + t.Fatalf("issueSummary.count = %v, want >= 1 — DetectProblems should have flagged "+ + "the broken/stuck-app Deployment (UnavailableReplicas=3)", count) + } +} + +// Happy-path sibling for the test above: when no group is passed, the +// typed-cache-first path is correct (and must continue to be — the v1 +// core Service is the dominant case and must not pay a dynamic-cache +// detour just because the group-qualified branch was added). 
+func TestAIGetResource_NoGroupHitsTypedCache(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/ai/resources/services/default/nginx?context=none") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + apiVersion, _ := body["apiVersion"].(string) + if apiVersion != "v1" { + t.Fatalf("apiVersion = %q, want v1 (core Service) on no-group request", apiVersion) + } +} diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go new file mode 100644 index 000000000..24b0dac5e --- /dev/null +++ b/internal/server/ai_handlers_rbac_test.go @@ -0,0 +1,263 @@ +package server + +import ( + "encoding/json" + "net/http" + "testing" + + "github.com/skyhook-io/radar/internal/auth" +) + +// RBAC preflight at /api/ai/resources/*. +// +// Two surfaces share preflight helpers with the equivalent REST handlers +// (preflightResourceGet for single-resource GETs from PR #721, +// preflightResourceList for list paths from PR #722). These tests pin +// the AI endpoints' gates so a regression that bypassed a helper on +// the AI side surfaces here even when the REST tests still pass. +// +// Where the REST list path returns 200 with `[]` for denies (legacy SPA +// shape that doesn't leak kind existence), the AI list path returns the +// explicit status so agents see the failure instead of confusing +// "empty cluster" output. + +// -- AI single-resource GET (T6 / PR #721) --------------------------------- + +func TestProxyAuth_AIGetSecret_PerNamespaceRBAC_Denied(t *testing.T) { + // alice has namespace access to "default" but the per-namespace + // canRead("","secrets","default","get") returns false. 
The cache holds + // nginx-tls (seeded as the SA which has cluster-wide secrets RBAC), + // so without the preflight a 200 would leak secret bytes. + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretGetCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secret/default/nginx-tls", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-secret without per-ns get SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetNode_ClusterScopedRBAC_Denied(t *testing.T) { + // Node is cluster-scoped — the AI GET must require per-kind get-node SAR. + // AllowedNamespaces==nil (cluster-wide-namespace sentinel) is NOT a + // license to read cluster-scoped kinds: that's the exact conflation the + // preflight helper guards against. A regression that dropped the + // ClassifyKindScope arm would let nodes through here. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("get", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/node/_/worker-1", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-node without cluster-scoped get-node SAR, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceDenied(t *testing.T) { + // alice has namespace access only to "default" — a get against a pod + // in "kube-system" must 403 BEFORE any fetch, matching handleGetResource. + // A regression that fetched first and then filtered would let timing + // signal whether the pod exists (oracle). 
+ env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/kube-system/some-pod", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Errorf("expected 403 for AI get-pod in disallowed namespace, got %d", resp.StatusCode) + } +} + +func TestProxyAuth_AIGetPod_NamespaceAllowed(t *testing.T) { + // Sanity check: a user with namespace access AND who hits an existing + // resource gets a 200 with the {resource, resourceContext} envelope. + // Pins that the preflight isn't accidentally over-gating happy-path + // requests (e.g., a misordered check that always denies). + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/pods/default/nginx-abc-xyz", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200 on allowed AI get-pod, got %d", resp.StatusCode) + } + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode body: %v", err) + } + if _, ok := body["resource"]; !ok { + t.Errorf("expected 'resource' field in AI get response, got: %+v", body) + } + if _, ok := body["resourceContext"]; !ok { + t.Errorf("expected 'resourceContext' field in AI get response, got: %+v", body) + } +} + +// -- AI list (T89 / PR #722) ----------------------------------------------- + +func TestAI_SecretsList_PerNamespaceDenied_Returns403(t *testing.T) { + // alice has namespace access to default but per-namespace + // `list secrets` is denied. preflightResourceList must intercept + // before reaching the cache. 
+ env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretListCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secrets?namespace=default", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI secrets list with per-namespace deny, got %d", resp.StatusCode) + } +} + +func TestAI_NodesList_NoClusterRBAC_Returns403(t *testing.T) { + // Nodes are cluster-scoped. Cluster-wide pod visibility + // (AllowedNamespaces nil sentinel) is not a license to read + // cluster-scoped kinds — the SAR-level gate must reject. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/nodes", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI nodes list without cluster-scope RBAC, got %d", resp.StatusCode) + } +} + +func TestAI_NamespacesList_NoListNamespacesSAR_Returns403(t *testing.T) { + // /api/ai/resources/namespaces returns full Namespace objects. + // Strict SAR gate — cluster-wide pod RBAC alone is not sufficient. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "namespaces", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/namespaces", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI namespaces list without list-namespaces SAR, got %d", resp.StatusCode) + } +} + +// TestAI_ListServices_WithGroup_RoutesToDynamicCache pins the group-aware +// short-circuit in handleAIListResources. 
For kind=services with no group, +// the typed core Service list path returns the seeded nginx Service. For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// aiListDynamic instead. Mirrors the same fix on GET in PR #721. +// +// The smoke TestMain seeds typed caches only; the dynamic resource cache +// isn't initialized, so the dynamic path surfaces a 500 with "resource +// discovery not initialized". That 500 IS the assertion: pre-fix the +// handler would return 200 with the core Service rows (silent +// wrong-kind result), which is the bug. +func TestAI_ListServices_WithGroup_RoutesToDynamicCache(t *testing.T) { + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + // Baseline: no group → typed cache returns the seeded core Service. + respCore := env.authGet(t, "/api/ai/resources/services?namespace=default", "bob", "") + defer respCore.Body.Close() + if respCore.StatusCode != http.StatusOK { + t.Fatalf("baseline (no group): expected 200, got %d", respCore.StatusCode) + } + var coreRows []map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreRows); err != nil { + t.Fatalf("decode core: %v", err) + } + var foundNginxSvc bool + for _, row := range coreRows { + if row["kind"] == "Service" && row["name"] == "nginx" { + foundNginxSvc = true + break + } + } + if !foundNginxSvc { + t.Fatalf("baseline (no group): expected nginx Service in typed list, got %+v", coreRows) + } + + // With group: must route through aiListDynamic. 
Dynamic cache isn't + // initialized in the smoke harness, so we expect either 400 ("unknown + // resource kind") or 500 ("dynamic resource cache not initialized" / + // "resource discovery not initialized") — anything BUT a 200 with + // core Services, which is the pre-fix wrong-result path. + respCRD := env.authGet(t, "/api/ai/resources/services?namespace=default&group=serving.knative.dev", "bob", "") + defer respCRD.Body.Close() + if respCRD.StatusCode == http.StatusOK { + var crdRows []map[string]any + if err := json.NewDecoder(respCRD.Body).Decode(&crdRows); err == nil { + for _, row := range crdRows { + if row["name"] == "nginx" { + t.Fatalf("group=serving.knative.dev leaked typed core Service into result (pre-fix bug): row=%+v", row) + } + } + } + } + if respCRD.StatusCode != http.StatusBadRequest && respCRD.StatusCode != http.StatusInternalServerError && respCRD.StatusCode != http.StatusOK { + t.Fatalf("group=serving.knative.dev: unexpected status %d (want 400/500 from uninitialized dynamic cache, or 200 with non-core rows)", respCRD.StatusCode) + } +} + +func TestAI_DeploymentsList_HappyPath_AttachesSummaryContext(t *testing.T) { + // Allowed user, summary-verbosity default. The envelope must + // include the seeded nginx deployment AND each row must carry a + // summaryContext field (the load-bearing new wire shape this PR + // adds — pin it so a refactor that skipped attachment surfaces + // here). 
+ env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/deployments", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + var rows []map[string]any + if err := json.NewDecoder(resp.Body).Decode(&rows); err != nil { + t.Fatalf("decode: %v", err) + } + if len(rows) == 0 { + t.Fatalf("allowed user got 0 deployments, expected seeded nginx") + } + + // AI list rows are flat (kind/name/namespace at the top level — + // the minified shape, distinct from the REST handler's K8s-native + // metadata-nested objects). Find the nginx row and assert + // summaryContext is present. Empty map is acceptable (the + // deployment is healthy and not managed by an external + // controller) — what matters is the envelope field exists so + // consumers don't have to special-case its absence. + var found bool + for _, row := range rows { + if row["name"] != "nginx" { + continue + } + found = true + if _, has := row["summaryContext"]; !has { + t.Errorf("nginx row missing summaryContext envelope: %+v", row) + } + } + if !found { + t.Errorf("nginx deployment not in AI list response: %+v", rows) + } +} diff --git a/internal/server/ai_handlers_severity_test.go b/internal/server/ai_handlers_severity_test.go new file mode 100644 index 000000000..abeadcef8 --- /dev/null +++ b/internal/server/ai_handlers_severity_test.go @@ -0,0 +1,27 @@ +package server + +import ( + "testing" + + "github.com/skyhook-io/radar/internal/issues" + bpaudit "github.com/skyhook-io/radar/pkg/audit" +) + +// Pin the audit→issue severity normalization on the AuditSummary wire. +// Without it, sibling resourceContext fields disagree on what "highest +// severity" means: audit emits "danger" while issueSummary emits +// "critical". 
Mirror the same mapping internal/issues.fromAudit uses +// for the unified issue stream so consumers see one vocabulary. +func TestNormalizeAuditSeverity(t *testing.T) { + cases := map[string]string{ + bpaudit.SeverityDanger: string(issues.SeverityCritical), + bpaudit.SeverityWarning: string(issues.SeverityWarning), + "": "", // empty stays empty — explicit contract + "unknown": "unknown", // future audit values pass through + } + for in, want := range cases { + if got := normalizeAuditSeverity(in); got != want { + t.Errorf("normalizeAuditSeverity(%q) = %q, want %q", in, got, want) + } + } +} diff --git a/internal/server/audit_handlers.go b/internal/server/audit_handlers.go index edf22d2d8..5465e638d 100644 --- a/internal/server/audit_handlers.go +++ b/internal/server/audit_handlers.go @@ -130,12 +130,16 @@ func (s *Server) handleAuditResource(w http.ResponseWriter, r *http.Request) { results = applyAuditSettings(results, getAuditConfig()) index := bp.IndexByResource(results.Findings) - // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment") - findings := index[bp.ResourceKey(kind, namespace, name)] + // Try exact kind first, then map API resource name (e.g. "deployments") to Go kind (e.g. "Deployment"). + // This handler is the UI's per-resource audit drill-down — group isn't on + // the URL today (the UI doesn't list grouped CRDs here yet), so we look + // up with group="" which matches the built-ins the audit suite scans. + // When CRD audit lands (#35 follow-up), thread group through the URL. 
+ findings := index[bp.ResourceKey("", kind, namespace, name)] if findings == nil { goKind := apiResourceToKind(kind) if goKind != kind { - findings = index[bp.ResourceKey(goKind, namespace, name)] + findings = index[bp.ResourceKey("", goKind, namespace, name)] } } if findings == nil { diff --git a/internal/server/rc_rbac.go b/internal/server/rc_rbac.go new file mode 100644 index 000000000..13c808055 --- /dev/null +++ b/internal/server/rc_rbac.go @@ -0,0 +1,88 @@ +package server + +import ( + "context" + "net/http" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// requestScopedChecker adapts Server.canRead into resourcecontext.RefAccessChecker +// with a request-local memoization layer keyed on (verb, group, kind, namespace). +// +// A single resourceContext build emits ~30 candidate refs but only ~5 distinct +// (group, kind, namespace) tuples — most workloads point at ConfigMaps and +// Secrets in their own namespace, plus a ServiceAccount and a Node. Caching +// here collapses the SAR fan-out before reaching s.canRead's per-user cache. +// +// The map is intentionally request-scoped (not server-scoped): server-scoped +// caching is already in pkg/auth.PermissionCache (2-min TTL) and reused via +// s.canRead. The per-request layer exists only to deduplicate the burst this +// builder generates within a single response. +type requestScopedChecker struct { + s *Server + req *http.Request + cache map[string]bool +} + +// newRequestScopedChecker returns a checker scoped to a single HTTP request. +// Not safe for concurrent use across requests; each handler invocation MUST +// construct its own checker. +func (s *Server) newRequestScopedChecker(r *http.Request) *requestScopedChecker { + return &requestScopedChecker{ + s: s, + req: r, + cache: make(map[string]bool, 8), + } +} + +// CanRead implements resourcecontext.RefAccessChecker. 
+// +// Authorization rules: +// - Namespaced kinds: SAR on (verb=get, group, resource, namespace). +// - Cluster-scoped kinds (namespace == ""): SAR on (verb=get, group, resource, ""). +// - Unknown kinds (not in discovery, not in static catalogue) pass through — +// mirrors the rest of the codebase's unknown-kind passthrough semantics. +// This is safe because Build only emits refs whose kinds are known to the +// topology builder (which itself uses discovery); a kind unknown here is a +// temporary discovery-cold state, not a permission bypass vector. +func (c *requestScopedChecker) CanRead(_ context.Context, group, kind, namespace string) bool { + key := "get|" + group + "|" + kind + "|" + namespace + if v, ok := c.cache[key]; ok { + return v + } + + resource := lookupResourceName(kind, group) + if resource == "" { + // Unknown kind — passthrough. See doc comment for rationale. + c.cache[key] = true + return true + } + + allowed := c.s.canRead(c.req, group, resource, namespace, "get") + c.cache[key] = allowed + return allowed +} + +// Compile-time assertion that requestScopedChecker satisfies the contract. +var _ resourcecontext.RefAccessChecker = (*requestScopedChecker)(nil) + +// lookupResourceName resolves a (kind, group) pair to the canonical plural +// resource name used by SubjectAccessReview. Tries the static cluster-only +// catalogue (covers Nodes / ClusterRoles / etc.), then discovery for everything +// else including CRDs. Returns "" when neither path knows the kind. 
+func lookupResourceName(kind, group string) string { + if kind == "" { + return "" + } + if g, r, ok := k8s.ClusterOnlyKindGVR(kind); ok && (group == "" || group == g) { + return r + } + if disc := k8s.GetResourceDiscovery(); disc != nil { + if ar, ok := disc.GetResourceWithGroup(kind, group); ok { + return ar.Name + } + } + return "" +} diff --git a/internal/server/search_handler.go b/internal/server/search_handler.go index 486224ae4..7e5f56f4e 100644 --- a/internal/server/search_handler.go +++ b/internal/server/search_handler.go @@ -102,6 +102,23 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { return s.canRead(r, group, resource, "", "list") }, } + // summaryContext attaches managedBy/health/issueCount per hit. Build + // the per-request closure once (one Compose call + cached topology + // snapshot) and let the search executor invoke it per kept hit. + // ?context=none opts out so legacy callers don't pay for the join. + // + // Search uses the dual-index variant: hits are mixed-kind in one + // response (namespaced Pods alongside cluster-scoped Nodes), so a + // single-namespace-scoped issue index would zero issueCount on + // cluster-scoped hits (whose issues live at namespace=""). The + // builder routes per-hit by scope. SAR gating above + // (CanReadClusterScoped) already constrains which cluster-scoped + // kinds are reachable. 
+ if r.URL.Query().Get("context") != "none" { + if builder := s.newSearchSummaryContextBuilder(scanNamespaces); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } if expr := r.URL.Query().Get("filter"); expr != "" { f, err := filter.CachedObjectFilter(expr) if err != nil { diff --git a/internal/server/server.go b/internal/server/server.go index b4e350187..6566d2f68 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1010,17 +1010,20 @@ func (s *Server) handleAPIResources(w http.ResponseWriter, r *http.Request) { s.writeJSON(w, resources) } -func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // parseNamespacesForUser primes the per-user perm cache (triggers - // DiscoverNamespaces if needed). canRead below relies on it. - namespaces := s.parseNamespacesForUser(r) - +// preflightResourceList runs the per-user RBAC gates shared by the REST +// (/api/resources/{kind}) and AI (/api/ai/resources/{kind}) list paths. +// It assumes the caller has already populated `namespaces` via +// parseNamespacesForUser (which primes the canI cache that canRead relies on). +// Cluster-scope classification is done here via k8s.ClassifyKindScope. +// +// Returns the (possibly-rewritten) namespace slice that downstream cache +// reads should use. When ok=false the gate denied or the user has no +// namespace access; (status, msg) carry the canonical HTTP response. REST +// callers historically convert denies to a 200 with `[]` to avoid leaking +// kind existence; the AI path returns the explicit status so agents see the +// failure. Same gates run in the same order on both paths — the response +// shape is the only thing that differs. 
+func (s *Server) preflightResourceList(r *http.Request, kind, group string, namespaces []string) (finalNamespaces []string, status int, msg string, ok bool) { // "namespaces" is cluster-scoped at the K8s API. Full Namespace objects // (labels, annotations, spec) require explicit list-namespaces SAR. // AllowedNamespaces is NOT a sufficient fallback: list-pods-in-alpha @@ -1032,10 +1035,9 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { isNamespacesKind := kind == "namespaces" || kind == "namespace" if isNamespacesKind { if !s.canRead(r, "", "namespaces", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list namespaces", false } - namespaces = nil // full lister output for SAR-authorized users + return nil, 0, "", true // full lister output for SAR-authorized users } // Cluster-only kinds (Nodes, PVs, StorageClasses, ClusterRoles, cluster- @@ -1043,19 +1045,19 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // noNamespaceAccess check so a user with explicit cluster-scoped RBAC but // no namespace access can still read those resources. isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) - if isClusterScoped && !isNamespacesKind { + if isClusterScoped { if !s.canRead(r, gvrGroup, gvrResource, "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, fmt.Sprintf("insufficient permissions to list %s", kind), false } // Cluster-scoped reads have no namespace dimension. Once the // resource-level SAR passes, force the later typed/dynamic cache paths // through their cluster-wide branch even if the user also has a // namespace view preference. 
- namespaces = nil - } else if !isNamespacesKind && noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) - return + return nil, 0, "", true + } + + if noNamespaceAccess(namespaces) { + return namespaces, http.StatusForbidden, "no namespace access", false } // Per-kind RBAC inside a namespace. Helm release storage IS K8s Secrets, @@ -1064,26 +1066,48 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // radar/templates/clusterrole.yaml). When any of those triggers fires // the cache holds every secret in the cluster, so per-user RBAC must // gate the read. Other namespaced kinds are deferred. - if (kind == "secrets" || kind == "secret") && !isClusterScoped { + if kind == "secrets" || kind == "secret" { if auth.UserFromContext(r.Context()) != nil { if namespaces == nil { // Auth user with cluster-wide namespace access (e.g. picked up // via DiscoverNamespaces stage 1: cluster-wide list pods). The // cache will serve all secrets — gate on cluster-scope SAR. if !s.canRead(r, "", "secrets", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list secrets", false } } else { namespaces = s.filterNamespacesByCanRead(r, "", "secrets", "list", namespaces) if len(namespaces) == 0 { - s.writeJSON(w, []any{}) - return + return namespaces, http.StatusForbidden, "insufficient permissions to list secrets", false } } } } + return namespaces, 0, "", true +} + +func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // parseNamespacesForUser primes the per-user perm cache (triggers + // DiscoverNamespaces if needed). canRead below relies on it. + namespaces := s.parseNamespacesForUser(r) + + // Shared RBAC gate. 
REST converts denies to 200 with `[]` (legacy shape + // the SPA tolerates and that doesn't leak kind existence); the AI path + // returns the explicit status. + finalNamespaces, _, _, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeJSON(w, []any{}) + return + } + namespaces = finalNamespaces + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") @@ -1470,29 +1494,23 @@ func setTypeMeta(resource any) { k8s.SetTypeMeta(resource) } -func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - namespace := chi.URLParam(r, "namespace") - name := chi.URLParam(r, "name") - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace - if namespace == "_" { - namespace = "" - } - - // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are - // gated per-kind via SAR. Run BEFORE the namespace access check so - // users with explicit cluster-scoped RBAC but no namespace access can - // still get the resource. ClassifyKindScope catches both static cluster- - // only kinds and dynamic cluster-scoped CRDs (via discovery). - // - // "namespaces" is cluster-scoped at the K8s API but exposed as a per-user - // filtered list — gate the GET via the user's namespace access for the - // requested name, not via cluster-scoped SAR. +// preflightResourceGet runs the per-user RBAC gates that must pass before any +// single-resource GET fetch. Mirrors the kind/scope-aware logic used by both +// the REST handler (handleGetResource) and the AI handler (handleAIGetResource) +// so future RBAC adjustments stay in lockstep across both surfaces. 
+// +// Inputs are the already-normalized (kind, namespace, name, group); callers +// must collapse the cluster-scoped "_" placeholder before calling. Returns +// (status, message, ok=true) when the request passes the gates, or +// (status, message, ok=false) with the HTTP status + body the caller should +// emit on deny. +// +// Three gates, run in this order: +// 1. kind == "namespaces" → full Namespace object requires get-namespaces SAR +// 2. cluster-scoped (Node/CRD/…) → per-kind get SAR (ClassifyKindScope) +// 3. namespaced → namespace access via getUserNamespaces, +// plus per-namespace get SAR for Secrets +func (s *Server) preflightResourceGet(r *http.Request, kind, namespace, name, group string) (int, string, bool) { isNamespacesKind := kind == "namespaces" || kind == "namespace" isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) switch { @@ -1502,30 +1520,56 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { // imply read access to the Namespace object itself. Restricted users // without ClusterRole on namespaces get 403 here. if !s.canRead(r, "", "namespaces", "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", name), false } case isClusterScoped: if !s.canRead(r, gvrGroup, gvrResource, "", "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind)) - return + return http.StatusForbidden, fmt.Sprintf("no access to %s (cluster-scoped resource requires explicit RBAC)", kind), false } case namespace != "": // Namespaced kind: verify namespace access. 
allowed := s.getUserNamespaces(r, []string{namespace}) if noNamespaceAccess(allowed) { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to namespace %q", namespace), false } // Per-kind RBAC inside the namespace for Secrets — the chart can // grant the SA cluster-wide secrets (Helm release visibility), so // namespace-list discovery is not a sufficient gate here. The list // handler has the matching list-SAR. if (kind == "secrets" || kind == "secret") && !s.canRead(r, "", "secrets", namespace, "get") { - s.writeError(w, http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace)) - return + return http.StatusForbidden, fmt.Sprintf("no access to secrets in namespace %q", namespace), false } } + return 0, "", true +} + +func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // Handle cluster-scoped resources: "_" is used as placeholder for empty namespace + if namespace == "_" { + namespace = "" + } + + // Cluster-scoped GETs (Node, ClusterRole, cluster-scoped CRDs, …) are + // gated per-kind via SAR. Run BEFORE the namespace access check so + // users with explicit cluster-scoped RBAC but no namespace access can + // still get the resource. ClassifyKindScope catches both static cluster- + // only kinds and dynamic cluster-scoped CRDs (via discovery). + // + // "namespaces" is cluster-scoped at the K8s API, and namespace access alone + // does not imply read access to the Namespace object — the GET is gated by + // an explicit get-namespaces SAR inside preflightResourceGet. 
+ if status, msg, ok := s.preflightResourceGet(r, kind, namespace, name, group); !ok { + s.writeError(w, status, msg) + return + } cache := k8s.GetResourceCache() if cache == nil { diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 90dcbef37..268283396 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -11,6 +11,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -28,6 +29,7 @@ var ( func TestMain(m *testing.M) { replicas := int32(1) + brokenReplicas := int32(3) deployUID := "deploy-uid-1234" rsUID := "rs-uid-5678" @@ -37,6 +39,37 @@ func TestMain(m *testing.M) { ObjectMeta: metav1.ObjectMeta{Name: "default"}, Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, }, + &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "broken"}, + Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, + }, + // Broken Deployment in its own namespace so it doesn't perturb the + // "default" fixture used by every other smoke test. Used by + // TestAIGetResource_IssueSummaryCountsURLPluralKind to assert the + // composer's URL-plural-kind filter actually matches the canonical + // Pascal-singular Issue.Kind values — pre-fix, count was 0. 
+ &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stuck-app", + Namespace: "broken", + Labels: map[string]string{"app": "stuck"}, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &brokenReplicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "stuck"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "stuck"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "stuck", Image: "registry.example/stuck:1"}}}, + }, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 3, + AvailableReplicas: 0, + UnavailableReplicas: 3, + }, + }, &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "nginx", @@ -137,6 +170,37 @@ func TestMain(m *testing.M) { Ports: []corev1.ServicePort{{Port: 80, TargetPort: intstr.FromInt(80)}}, }, }, + // Ingress routing to the core Service "nginx". Used by + // TestAIGetResource_GroupRoutesRelationshipsToKnative to give the + // core Service a distinct incoming edge (EdgeRoutesTo) that the + // Knative Service node does NOT inherit — the test compares whether + // the AI GET handler picks up that edge under ?group=serving.knative.dev + // (regression for the kind-passed-to-relationship-lookup bug). 
+ &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nginx-ingress", + Namespace: "default", + }, + Spec: networkingv1.IngressSpec{ + Rules: []networkingv1.IngressRule{{ + Host: "nginx.example.com", + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{{ + Path: "/", + PathType: func() *networkingv1.PathType { p := networkingv1.PathTypePrefix; return &p }(), + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: "nginx", + Port: networkingv1.ServiceBackendPort{Number: 80}, + }, + }, + }}, + }, + }, + }}, + }, + }, // Seed Secrets in two namespaces so per-user RBAC tests can // distinguish "gate denied → []" from "no secrets in cache" and can // exercise the partial-allow case (one ns allowed, the other denied). diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go new file mode 100644 index 000000000..6853930a1 --- /dev/null +++ b/internal/server/summary_context.go @@ -0,0 +1,70 @@ +// Per-request helpers that compute the compact ResourceSummaryContext attached +// to /api/ai/resources/{kind} list rows and /api/search hits. +// +// The shared core (issue index, kind canonicalization, managedBy +// resolution, per-row scope dispatch) lives in +// internal/summarycontext. This file is the REST-specific wrapper — +// it sources topology from the server-wide broadcaster cache and +// otherwise just plumbs arguments through. + +package server + +import ( + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/summarycontext" +) + +// newResourceSummaryContextBuilder assembles the per-request closure for the +// list/search handlers. Returns nil when the issue cache provider isn't +// available, in which case callers should skip context attachment rather +// than emit empty objects; a nil topology snapshot is passed through as-is. 
+// +// Callers pass the namespace list they're scanning so the issue index +// is scoped to just those rows (the full Compose call on a 100-namespace +// cluster is fine; this is mostly belt-and-suspenders for very large +// envs). Pass nil to compose cluster-wide. +// +// Use newSearchSummaryContextBuilder for search, which routes per-hit +// between a namespaced and a cluster-wide index — search returns mixed +// kinds in one response, so a single index can't get both right. +func (s *Server) newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + idx := summarycontext.BuildIssueIndex(provider, namespaces) + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), idx, idx) +} + +// newSearchSummaryContextBuilder is the search-specific variant. Search +// hits are MIXED-kind in one response — a single query can return both +// namespaced Pods and cluster-scoped Nodes. A single issue index can't +// be both: scoped to the user's namespaces it would silently zero +// issueCount on Node/PV/cluster-scoped CRD hits (whose issues live at +// namespace=""); composed cluster-wide it would over-count or pull in +// rows the namespace-restricted user shouldn't see. +// +// Fix: build two indexes per request. namespacedIdx is scoped to +// scanNamespaces (intersection of user RBAC and the query's `ns:` +// modifier). clusterIdx is composed cluster-wide (nil filter) so +// namespace="" issues surface. The returned closure dispatches per-hit +// via k8s.ClassifyKindScope(kind, group). Search-level RBAC +// (CanReadClusterScoped) already gated which cluster-scoped kinds the +// user can see, so the cluster-wide index doesn't expose unauthorized +// rows. +// +// The cluster-wide index is skipped when scanNamespaces is already nil +// (cluster-wide user) — both indexes would be identical, so one pass +// suffices. 
+func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) + } + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), namespacedIdx, clusterIdx) +} diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go new file mode 100644 index 000000000..af4fb3f22 --- /dev/null +++ b/internal/server/summary_context_test.go @@ -0,0 +1,231 @@ +// Wiring tests for the REST-side ResourceSummaryContext builders. The pure- +// function tests (issueIndex key arithmetic, BuildIssueIndex over a +// fake provider, CanonicalSingular, ManagedByFromRelationships) live in +// internal/summarycontext alongside the shared core they exercise. + +package server + +import ( + "encoding/json" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/summarycontext" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// stubBuilder records calls and returns a deterministic ResourceSummaryContext +// keyed by the resource identity. Avoids standing up a topology cache or +// issue provider — those are exercised by the per-layer unit tests. +// +// Key shape mirrors the production issueIndexKey (group|kind|ns|name) +// so test fixtures pin the group-aware lookup. 
+func stubBuilder(t *testing.T, want map[string]*resourcecontext.ResourceSummaryContext) summarycontext.Builder { + t.Helper() + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + key := group + "|" + kind + "|" + namespace + "|" + name + return want[key] + } +} + +// TestAttachResourceSummaryContextToList wires together MinifyList + the +// per-row attach helper and asserts the ResourceSummaryContext field lands in +// the JSON each row marshals to. +func TestAttachResourceSummaryContextToList(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-1", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning, ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}}, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-2", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + } + // Group is "" for core-group Pods. + want := map[string]*resourcecontext.ResourceSummaryContext{ + "|Pod|prod|api-1": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "healthy", + IssueCount: 0, + }, + "|Pod|prod|api-2": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "unhealthy", + IssueCount: 3, + }, + } + + results, err := aicontext.MinifyList(objs, aicontext.LevelSummary) + if err != nil { + t.Fatalf("MinifyList: %v", err) + } + summarycontext.AttachToTypedList(results, objs, stubBuilder(t, want)) + + // Row 0 — healthy pod. + b, _ := json.Marshal(results[0]) + wantSubs := []string{ + `"summaryContext":`, + `"managedBy":{"kind":"Deployment"`, + `"health":"healthy"`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 0 missing %s in %s", sub, b) + } + } + + // Row 1 — unhealthy pod with issueCount. 
+ b, _ = json.Marshal(results[1]) + wantSubs = []string{ + `"health":"unhealthy"`, + `"issueCount":3`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 1 missing %s in %s", sub, b) + } + } +} + +// TestAttachResourceSummaryContextToList_MismatchedLengthsSilent — defensive +// path that protects against a future refactor where MinifyList might +// drop unsupported kinds. Attach must skip rather than panic. +func TestAttachResourceSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "api-1"}}, + } + results := []any{ + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-1"}, + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, + } + // Length mismatch (1 obj vs 2 results) — must not panic, must skip. + summarycontext.AttachToTypedList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + return &resourcecontext.ResourceSummaryContext{Health: "healthy"} + }) + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok { + t.Fatalf("row %d: unexpected type %T", i, row) + } + if summary.SummaryContext != nil { + t.Errorf("row %d: ResourceSummaryContext should be nil on length mismatch, got %#v", i, summary.SummaryContext) + } + } +} + +// TestAttachResourceSummaryContextToUnstructuredList covers the dynamic-CRD +// path. summarizeUnstructured returns *ResourceSummary so the attach +// helper is symmetric with the typed path. 
+func TestAttachResourceSummaryContextToUnstructuredList(t *testing.T) { + items := []*unstructured.Unstructured{ + {Object: map[string]any{ + "apiVersion": "argoproj.io/v1alpha1", + "kind": "Application", + "metadata": map[string]any{"name": "storefront", "namespace": "argocd"}, + "status": map[string]any{"conditions": []any{map[string]any{"type": "Ready", "status": "True"}}}, + }}, + } + want := map[string]*resourcecontext.ResourceSummaryContext{ + "argoproj.io|Application|argocd|storefront": { + Health: "healthy", + IssueCount: 1, + }, + } + + results := []any{aicontext.MinifyUnstructured(items[0], aicontext.LevelSummary)} + summarycontext.AttachToUnstructuredList(results, items, stubBuilder(t, want)) + + summary, ok := results[0].(*aicontext.ResourceSummary) + if !ok || summary == nil { + t.Fatalf("unexpected row type %T", results[0]) + } + if summary.SummaryContext == nil { + t.Fatalf("ResourceSummaryContext not attached") + } + if summary.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", summary.SummaryContext.Health) + } + if summary.SummaryContext.IssueCount != 1 { + t.Errorf("IssueCount = %d, want 1", summary.SummaryContext.IssueCount) + } +} + +// contains is a tiny strings.Contains alias kept local so the test file +// doesn't need a strings import alongside the existing imports. +func contains(s, sub string) bool { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false +} + +// TestIssueIndexNamespaces_ClusterScopedDropsFilter pins the fix for the +// "cluster-scoped issues filtered out for cluster-scoped rows" bug. +// Pre-fix, handleAIListResources passed the user's namespaced-access set +// straight into the issue index. For cluster-scoped kinds (Node, PV, +// cluster-scoped CRDs) every issue lives at namespace="" — the index +// then dropped them all, silently zeroing issueCount on every row even +// when the user had cluster-scoped read access. 
The helper now returns +// nil for cluster-scoped kinds so Compose runs cluster-wide. +func TestIssueIndexNamespaces_ClusterScopedDropsFilter(t *testing.T) { + userNs := []string{"prod", "staging"} + + // Cluster-scoped built-ins from the static catalogue (ClassifyKindScope + // hits ClusterOnlyKindGVR before touching discovery, so this works + // without a discovery client wired up). + clusterCases := []struct { + kind string + group string + }{ + {"Node", ""}, + {"nodes", ""}, + {"PersistentVolume", ""}, + {"ClusterRole", "rbac.authorization.k8s.io"}, + {"StorageClass", "storage.k8s.io"}, + } + for _, tc := range clusterCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if got != nil { + t.Errorf("issueIndexNamespaces(%q, %q) = %v, want nil — cluster-scoped kinds must not be namespace-filtered", + tc.kind, tc.group, got) + } + } + + // Namespaced kinds preserve the user's namespace set as-is so the + // scoping the per-user RBAC enforced upstream is honored. + namespacedCases := []struct { + kind string + group string + }{ + {"Pod", ""}, + {"Deployment", "apps"}, + {"ConfigMap", ""}, + } + for _, tc := range namespacedCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if len(got) != len(userNs) { + t.Errorf("issueIndexNamespaces(%q, %q) len = %d, want %d (namespace filter must pass through for namespaced kinds)", + tc.kind, tc.group, len(got), len(userNs)) + continue + } + for i := range got { + if got[i] != userNs[i] { + t.Errorf("issueIndexNamespaces(%q, %q)[%d] = %q, want %q", + tc.kind, tc.group, i, got[i], userNs[i]) + } + } + } + + // Pass-through when caller already provided nil (cluster-wide). 
+ if got := issueIndexNamespaces(nil, "Pod", ""); got != nil { + t.Errorf("issueIndexNamespaces(nil, Pod) = %v, want nil", got) + } +} diff --git a/internal/summarycontext/attach.go b/internal/summarycontext/attach.go new file mode 100644 index 000000000..3f0ea18ce --- /dev/null +++ b/internal/summarycontext/attach.go @@ -0,0 +1,70 @@ +package summarycontext + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/k8s" +) + +// AttachToTypedList fills in SummaryContext for each *aicontext.ResourceSummary +// row produced from typed runtime.Object items (typed-cache list path). +// results and objs must be parallel slices — length mismatch is treated as a +// caller bug and the function returns without touching the rows. +// +// Group is sourced per-object from the typed object's GVK via SetTypeMeta + +// GetObjectKind, so list paths that mix kinds stay correct. +func AttachToTypedList(results []any, objs []runtime.Object, builder Builder) { + if len(results) != len(objs) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromObject(objs[i]) + summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// AttachToUnstructuredList is the dynamic-CRD counterpart of +// AttachToTypedList. Group comes from each item's apiVersion so two CRDs that +// share kind+ns+name across API groups (e.g. multiple operators each shipping +// a "Cluster" resource) get independent issue counts. 
+func AttachToUnstructuredList(results []any, items []*unstructured.Unstructured, builder Builder) { + if len(results) != len(items) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromUnstructured(items[i]) + summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// GroupFromObject extracts the API group from a typed runtime.Object's +// GroupVersionKind. Returns "" for core-group objects (Pod, Service, etc.) +// and when the GVK is unset. Calls k8s.SetTypeMeta so the GVK is populated +// from scheme metadata when the object came out of the typed cache without +// it set. +func GroupFromObject(obj runtime.Object) string { + if obj == nil { + return "" + } + k8s.SetTypeMeta(obj) + return obj.GetObjectKind().GroupVersionKind().Group +} + +// GroupFromUnstructured pulls the API group from an unstructured's apiVersion. +// Mirrors GroupFromObject for the dynamic-CRD path. +func GroupFromUnstructured(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + return u.GroupVersionKind().Group +} diff --git a/internal/summarycontext/summarycontext.go b/internal/summarycontext/summarycontext.go new file mode 100644 index 000000000..e375e6ab2 --- /dev/null +++ b/internal/summarycontext/summarycontext.go @@ -0,0 +1,225 @@ +// Package summarycontext is the shared core that powers the compact +// ResourceSummaryContext attached to /api/ai/resources/{kind} list rows, /api/search +// hits, and the MCP list_resources / search variants. +// +// The REST and MCP wrappers (internal/server, internal/mcp) differ only +// in their topology source — REST reads from a server-wide broadcaster +// cache; MCP memoizes per-process builds. Everything else (issue index, +// kind canonicalization, managedBy resolution, per-row dispatch by +// scope) is identical, so it lives here. 
+// +// pkg/resourcecontext intentionally has no dependencies on internal/* +// or pkg/topology; the join happens here. +package summarycontext + +import ( + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// Builder is the per-request closure that produces a ResourceSummaryContext for +// a single resource. nil result is fine — the ResourceSummaryContext field is +// omitempty on every consumer. +// +// group is required so the per-resource issue lookup can distinguish +// CRDs that share kind+namespace+name across API groups (e.g. Knative +// Service vs corev1 Service, or two custom CRDs both named "Cluster" +// from different operators). Pass "" for core-group resources. +type Builder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext + +// BuilderFromIndexes assembles the per-request closure. The list path +// passes the same index for both namespacedIdx and clusterIdx (single- +// kind list, scope already chosen by the caller); search passes two +// distinct indexes — namespacedIdx scoped to user namespaces, clusterIdx +// composed cluster-wide. The closure dispatches per-hit by scope so +// cluster-scoped hits read the cluster-wide index and surface +// namespace="" issues that the namespaced filter would otherwise drop. +// +// topo is the topology snapshot the caller has already obtained from +// its preferred source (REST: broadcaster cache; MCP: short-TTL +// memoizer). nil topo is fine — managedBy is omitted but issueCount +// still resolves. 
+func BuilderFromIndexes(topo *topology.Topology, namespacedIdx, clusterIdx IssueIndex) Builder { + resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) + dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + // One inverted-edges index per request — without it each + // GetRelationships call would re-scan topo.Edges in O(E), turning + // the list/search hot path into O(N × E). See pkg/topology T3. + var relIdx *topology.RelationshipsIndex + if topo != nil { + relIdx = topology.IndexByResource(topo) + } + + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + var managedBy *resourcecontext.ManagedByRef + if topo != nil { + // Pass the fetched object when available so synthesis is + // group-aware (avoids kind/plural collisions like Knative + // Service vs corev1 Service). Falls back to (kind, ns, name) + // lookup when neither obj nor u is set. + var rawObj any + switch { + case obj != nil: + rawObj = obj + case u != nil: + rawObj = u + } + rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) + managedBy = ManagedByFromRelationships(rel) + } + var source runtime.Object = obj + if source == nil && u != nil { + source = u + } + // Dispatch by scope: cluster-scoped hits read clusterIdx (composed + // at namespace=nil so namespace="" issues are present), namespaced + // hits read namespacedIdx (which honors the user's namespace + // filter so the per-row count doesn't pull in noise from + // namespaces the user can't see). 
+ idx := namespacedIdx + if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { + idx = clusterIdx + } + return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ + ManagedBy: managedBy, + IssueCount: idx.Count(group, kind, namespace, name), + }) + } +} + +// IssueIndex keys per-resource issue counts as "group|kind|namespace|name". +// Group goes FIRST so two CRDs sharing kind+namespace+name across API +// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, +// or two operators each shipping a "Cluster" CRD) get independent counts +// instead of inheriting each other's. Kind is canonicalized via +// CanonicalSingular because issue sources emit the kind as-typed +// (Deployment) while callers may pass the URL plural (deployments) — +// canonicalization normalizes both. "|" can't appear in a Kubernetes API +// group (groups follow DNS subdomain rules), so it's a safe delimiter. +type IssueIndex map[string]int + +// Count returns the per-resource issue count, keyed by the group-aware +// composite key. Zero on miss. +func (i IssueIndex) Count(group, kind, namespace, name string) int { + return i[issueIndexKey(group, kind, namespace, name)] +} + +func issueIndexKey(group, kind, namespace, name string) string { + return group + "|" + strings.ToLower(CanonicalSingular(kind)) + "|" + namespace + "|" + name +} + +// CanonicalSingular collapses common plural forms back to the singular +// kind the issue engine emits. Cheap surface — only the kinds we +// actually scan in list_resources / search. 
func CanonicalSingular(kind string) string {
	lowered := strings.ToLower(kind)
	switch lowered {
	case "pods", "services", "deployments", "daemonsets", "statefulsets",
		"replicasets", "jobs", "cronjobs", "configmaps", "secrets",
		"persistentvolumeclaims", "persistentvolumes",
		"poddisruptionbudgets", "nodes", "namespaces", "events":
		// Regular plurals: the singular is the plural minus its final "s".
		return strings.TrimSuffix(lowered, "s")
	case "ingresses", "storageclasses":
		// "-es" plurals.
		return strings.TrimSuffix(lowered, "es")
	case "horizontalpodautoscalers", "hpas", "hpa":
		// The plural and both short aliases share one canonical kind.
		return "horizontalpodautoscaler"
	default:
		// Anything not listed above is returned lowercased but otherwise
		// untouched.
		return lowered
	}
}

// BuildIssueIndex composes the per-request issue index. NoLimit (not
// MaxLimit) is required here: a 5000-issue cluster would otherwise
// truncate after the first 1000 sorted rows, silently zeroing
// issueCount for resources whose issues fall in the tail. We're
// bucketing for a per-resource lookup, not paginating — the caller of
// the builder never sees the issue list itself.
//
// We rely on Filters.IncludeAudit and Filters.IncludeEvents staying
// false-by-default — that's what keeps the per-row count to "problem"
// + "condition" only. Audit + Warning events are loud and require
// explicit opt-in; rolling them into the per-row count would distort
// "this Pod has 1 issue" for the common case.
//
// No Kinds filter on Compose: the index buckets every composed row by
// (group, kind, ns, name), and the per-row lookup keys off
// issueIndexKey(...) with the same canonicalization, so kind-mismatched
// rows simply never read.
Filtering Compose itself by Kind would need +// CRD-plural awareness — CanonicalSingular handles built-ins but +// returns CRD plurals (e.g. "applications") unchanged, and the issue +// engine emits "Application", silently zeroing issueCount on every CRD +// row. Bucketing is O(N) over the at-most-namespace-bounded issue set, +// which the consumer materialises anyway. +func BuildIssueIndex(p issues.Provider, namespaces []string) IssueIndex { + filters := issues.Filters{ + Namespaces: namespaces, + Limit: issues.NoLimit, + } + composed := issues.Compose(p, filters) + idx := make(IssueIndex, len(composed)) + for _, iss := range composed { + idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ + } + return idx +} + +// ManagedByFromRelationships extracts a compact ManagedByRef from +// computed topology relationships. Preference order: +// 1. Relationships.ManagedBy[0] — the server-synthesized topmost +// manager (ArgoCD Application > Flux Kustomization/HelmRelease > +// Helm release > topmost K8s owner). Walks the owner chain past +// ReplicaSets to the controlling Deployment in one shot. +// 2. Direct Owner — fallback for shapes ManagedBy synthesis declines +// (e.g. cluster-scoped roots where the topmost manager is the +// resource itself). +// +// Returns nil when topology has no relationship for the resource. 
+func ManagedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { + if rel == nil { + return nil + } + if len(rel.ManagedBy) > 0 { + ref := rel.ManagedBy[0] + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + } + if rel.Owner != nil { + return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) + } + return nil +} diff --git a/internal/summarycontext/summarycontext_test.go b/internal/summarycontext/summarycontext_test.go new file mode 100644 index 000000000..f032e1f52 --- /dev/null +++ b/internal/summarycontext/summarycontext_test.go @@ -0,0 +1,376 @@ +// Pure-function tests for the shared summarycontext core. The +// REST/MCP-specific wiring tests (attachSummaryContextToList, +// dispatch-on-CanReadClusterScoped, the ai-handler issueIndexNamespaces +// helper) stay at their respective handler sites in internal/server +// and internal/mcp. + +package summarycontext + +import ( + "fmt" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + bp "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// fakeIssuesProvider is a minimal issues.Provider for the BuildIssueIndex +// tests. Only the fields the index path touches are wired. +// +// DetectProblems mirrors CacheProvider.DetectProblems: empty namespaces +// returns the full set; a non-empty slice drops cluster-scoped rows +// (Namespace=="") to match the production flattenNamespacedProblems +// behavior — needed so the cluster-scoped-filter regression test can +// pin the actual bug. 
+type fakeIssuesProvider struct { + problems []k8s.Problem +} + +func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { + if len(namespaces) == 0 { + return f.problems + } + allowed := map[string]bool{} + for _, ns := range namespaces { + allowed[ns] = true + } + out := make([]k8s.Problem, 0, len(f.problems)) + for _, p := range f.problems { + if p.Namespace == "" { + continue + } + if allowed[p.Namespace] { + out = append(out, p) + } + } + return out +} +func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } +func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } +func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { + return nil +} +func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } +func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { + return nil, nil +} +func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } +func (f *fakeIssuesProvider) KyvernoFindings() []policyreports.SubjectFindings { return nil } +func (f *fakeIssuesProvider) KyvernoStatus() string { return "" } + +func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } + +// TestIssueIndexKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups get independent +// counts. Without group in the key, e.g. Knative serving.knative.dev/ +// Service vs corev1 ""/Service collapse onto one bucket — and either +// the CRD inherits the core Service's count or vice versa. This breaks +// the moment a user has two operators each shipping a kind named +// "Cluster" in the same namespace. +func TestIssueIndexKey_GroupAware(t *testing.T) { + idx := IssueIndex{} + // Same kind+ns+name, different groups — must be independent buckets. 
+ idx[issueIndexKey("", "Service", "prod", "api")] = 2 + idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 + + if got := idx.Count("", "Service", "prod", "api"); got != 2 { + t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 5 { + t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) + } + // Wrong group lookup is a miss, not a fallback. + if got := idx.Count("example.io", "Service", "prod", "api"); got != 0 { + t.Errorf("unknown-group lookup = %d, want 0 (key should not coalesce across groups)", got) + } +} + +// TestBuildIssueIndex_GroupAware exercises the full BuildIssueIndex +// path with two CRDs that share kind+namespace+name but live in +// different API groups. Pre-fix, both rows landed under the same +// "service|prod|api" key and one inherited the other's count. +func TestBuildIssueIndex_GroupAware(t *testing.T) { + // Inject via a fake issues.Provider rather than the cache plumbing — + // keeps the test focused on the index-key arithmetic. 
+ p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, + }, + } + idx := BuildIssueIndex(p, nil) + if got := idx.Count("", "Service", "prod", "api"); got != 1 { + t.Errorf("core Service count = %d, want 1", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 2 { + t.Errorf("Knative Service count = %d, want 2", got) + } +} + +// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues +// would fall in the tail beyond MaxLimit still get correct issueCounts. +// Pre-fix, BuildIssueIndex passed Limit:MaxLimit (1000) to Compose; on +// a cluster with >1000 issues the post-sort truncation silently zeroed +// out counts for tail resources. The fix is Limit:NoLimit — the index +// is a bucketed count, not a paginated list. +func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { + probs := make([]k8s.Problem, 0, issues.MaxLimit+50) + for i := 0; i < issues.MaxLimit+50; i++ { + probs = append(probs, k8s.Problem{ + Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", + }) + } + p := &fakeIssuesProvider{problems: probs} + idx := BuildIssueIndex(p, nil) + tailName := fmtPodName(issues.MaxLimit + 25) + if got := idx.Count("", "Pod", "prod", tailName); got != 1 { + t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) + } + if got := idx.Count("", "Pod", "prod", fmtPodName(0)); got != 1 { + t.Errorf("head pod count = %d, want 1", got) + } +} + +// TestCanonicalSingular pins the kind normalization used to align URL +// plurals with the singular form the issue engine emits. 
+func TestCanonicalSingular(t *testing.T) { + cases := map[string]string{ + "pods": "pod", + "Pods": "pod", + "Deployment": "deployment", + "deployments": "deployment", + "hpa": "horizontalpodautoscaler", + "unknownkind": "unknownkind", + } + for in, want := range cases { + if got := CanonicalSingular(in); got != want { + t.Errorf("CanonicalSingular(%q) = %q, want %q", in, got, want) + } + } +} + +// TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the +// end-to-end behavior: when the builder passes nil for the namespace +// filter (cluster-scoped kind), node-level issues at namespace="" +// surface in the index and the per-resource lookup returns the correct +// count. With a namespace filter populated, those same issues are +// dropped because Compose's per-namespace problem walk never sees them. +func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + // Cluster-scoped Node issue: namespace="" — the actual shape + // k8s.DetectProblems emits for NodeNotReady / DiskPressure etc. + {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + }, + } + + // Cluster-wide compose (nil namespaces) — issue surfaces. + idx := BuildIssueIndex(p, nil) + if got := idx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) + } + + // Namespace-scoped compose — same issue, but ns filter to + // ["prod","staging"] drops it because the user-namespaced perm + // slice never matches "". This is what the pre-fix handler did for + // Node lists. 
+ scopedIdx := BuildIssueIndex(p, []string{"prod", "staging"}) + if got := scopedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) + } +} + +// TestBuildIssueIndex_CRDPlural_NonZeroCount pins the fix for a Bugbot +// finding on PR #722: a CRD listed by its plural form (e.g. +// "applications" for ArgoCD Application) silently returned +// issueCount=0 because BuildIssueIndex used to push the URL kind +// through CanonicalSingular into filters.Kinds. CanonicalSingular only +// covers built-in plurals — CRD plurals fell through unchanged +// ("applications" stayed "applications"), Compose's case-insensitive +// Kind filter then failed against the singular "Application" the +// issue engine emits, and every CRD row's count was zero. We dropped +// the Kinds filter entirely: bucketing by issueIndexKey(group, kind, +// ns, name) is already correct because the lookup side runs through +// CanonicalSingular too. Per-resource lookup uses the row's singular +// Kind (Pascal "Application") so the index and the query agree. +func TestBuildIssueIndex_CRDPlural_NonZeroCount(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Application", Group: "argoproj.io", Namespace: "argocd", Name: "storefront", Reason: "SyncFailed", Severity: "critical"}, + }, + } + + // Pre-fix simulation: the handler would have passed kindFilter="applications" + // — the URL plural. We no longer take a kindFilter, but verify that + // the index contains the row keyed by the canonical singular form. 
+ idx := BuildIssueIndex(p, []string{"argocd"}) + if got := idx.Count("argoproj.io", "Application", "argocd", "storefront"); got != 1 { + t.Errorf("CRD Application count (singular kind) = %d, want 1", got) + } + // Also pin the URL-form lookup path: the per-row Builder is called + // with the kind as returned by MinifyUnstructured, which for CRDs + // is the singular ("Application"). If a caller ever pushed the + // plural ("applications") through Count(), CanonicalSingular won't + // normalize unknown CRD plurals — that's a separate latent issue + // that doesn't manifest today because the row source uses the + // singular. Document the asymmetry explicitly. + if got := idx.Count("argoproj.io", "applications", "argocd", "storefront"); got != 0 { + t.Errorf("CRD lookup via plural = %d, want 0 (CanonicalSingular only normalizes built-ins; row source uses singular Kind, so lookup matches via singular path)", got) + } +} + +// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end +// shape used by /api/search and MCP search: scanNamespaces is non-nil +// (a namespace-restricted user, or a user with a `ns:` query modifier), +// so the constructor must compose TWO issue indexes — one scoped to +// those namespaces, one cluster-wide for cluster-scoped hits. Without +// the second index, the Node hit's summaryContext.issueCount returns +// 0 because every Node issue lives at namespace="" and the namespace +// filter drops them. +func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, + }, + } + + // Build the two indexes the search constructor would build. 
+ namespacedIdx := BuildIssueIndex(p, []string{"prod"}) + clusterIdx := BuildIssueIndex(p, nil) + + // Sanity: pre-fix, the search handler passed namespacedIdx for + // both; Node issueCount silently zeroed. + if got := namespacedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespacedIdx Node count = %d, want 0 (sanity — namespace filter drops cluster-scoped issues)", got) + } + if got := clusterIdx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("clusterIdx Node count = %d, want 1 (cluster-wide compose surfaces namespace=\"\" issues)", got) + } + if got := namespacedIdx.Count("", "Pod", "prod", "api-7"); got != 1 { + t.Errorf("namespacedIdx Pod count = %d, want 1", got) + } + + // With both indexes built, the closure dispatches per-hit by + // scope. Replay the dispatch via the shared helper to pin the + // end-to-end shape. Topology is nil; managedBy is nil but + // issueCount dispatch is what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) + } +} + +// TestBuilderFromIndexes_DispatchesByScope pins the dual-index dispatch: +// cluster-scoped hits (Node, PV, …) read the cluster-wide index (where +// namespace="" issues live), namespaced hits (Pod, Deployment, …) read +// the namespace-scoped index. Without this dispatch, a search response +// that mixes Pods and Nodes silently zeros issueCount on the Node hits +// — the namespace-scoped index drops every namespace="" issue. +// +// A wiring inversion (cluster-scoped → namespaced index) would +// re-introduce the bug, so we additionally assert no cross-bucket leak. 
+func TestBuilderFromIndexes_DispatchesByScope(t *testing.T) { + // Build two distinct indexes so we can tell which one was consulted. + namespacedIdx := IssueIndex{} + namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 + + clusterIdx := IssueIndex{} + clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 + + // Topology is nil — managedBy is nil but issueCount dispatch is + // what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + + // Cluster-scoped Node hit — must read clusterIdx. + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { + t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) + } + // Namespaced Pod hit — must read namespacedIdx. + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { + t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) + } + // A cluster-scoped hit whose name only lives in the namespaced + // index must return 0 (no cross-bucket leak). + if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) + } + // And a namespaced hit whose name only lives in the cluster index + // likewise returns 0. + if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) + } +} + +// TestManagedByFromRelationships_PrefersManagedBy pins the topmost-manager +// shortcut: when topology has synthesized a ManagedBy chain (Pod → +// ReplicaSet → Deployment), the helper surfaces the Deployment, not the +// noisy hash-suffixed ReplicaSet that sits in Owner. 
+func TestManagedByFromRelationships_PrefersManagedBy(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + }, + } + got := ManagedByFromRelationships(rel) + want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} + if got == nil || got.Kind != want.Kind || got.Name != want.Name || got.Namespace != want.Namespace || got.Source != want.Source { + t.Errorf("got %#v, want %#v", got, want) + } +} + +// TestManagedByFromRelationships_FallsBackToOwner covers the case where +// topology synthesis declined ManagedBy (e.g. cluster-scoped roots) — +// we still surface the direct Owner so the row isn't context-less. +func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + } + got := ManagedByFromRelationships(rel) + if got == nil { + t.Fatalf("got nil, want Application ref") + } + if got.Source != "argocd" { + t.Errorf("Source = %q, want argocd", got.Source) + } +} + +// TestManagedByFromRelationships_ManagedByWinsOverOwner pins that when +// both ManagedBy and Owner are set, ManagedBy[0] takes precedence — the +// server-synthesized topmost-manager walk should never be shadowed by +// the direct owner ref left over for back-compat. 
+func TestManagedByFromRelationships_ManagedByWinsOverOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + }, + } + got := ManagedByFromRelationships(rel) + if got == nil || got.Kind != "Application" || got.Source != "argocd" { + t.Errorf("got %#v, want Application/argocd", got) + } +} + +func TestManagedByFromRelationships_NilSafe(t *testing.T) { + if got := ManagedByFromRelationships(nil); got != nil { + t.Errorf("nil rel: got %#v, want nil", got) + } + if got := ManagedByFromRelationships(&topology.Relationships{}); got != nil { + t.Errorf("empty rel: got %#v, want nil", got) + } +} diff --git a/pkg/ai/context/logs.go b/pkg/ai/context/logs.go index 50e09e82d..cd0b7192c 100644 --- a/pkg/ai/context/logs.go +++ b/pkg/ai/context/logs.go @@ -74,6 +74,26 @@ func FilterLogs(rawLogs string) FilteredLogs { } } +// FilterLogsByPattern first keeps only lines matching pattern, then applies +// the usual diagnostic log filtering. This gives agents a server-side +// equivalent of `kubectl logs ... | grep PATTERN | tail`. +func FilterLogsByPattern(rawLogs, pattern string) (FilteredLogs, error) { + if strings.TrimSpace(pattern) == "" { + return FilterLogs(rawLogs), nil + } + re, err := regexp.Compile(pattern) + if err != nil { + return FilteredLogs{}, err + } + var matched []string + for _, line := range strings.Split(strings.TrimRight(rawLogs, "\n"), "\n") { + if re.MatchString(line) { + matched = append(matched, line) + } + } + return FilterLogs(strings.Join(matched, "\n")), nil +} + // deduplicateStackTraces collapses identical consecutive lines with a repeat count. 
func deduplicateStackTraces(lines []string) []string { if len(lines) == 0 { diff --git a/pkg/ai/context/logs_test.go b/pkg/ai/context/logs_test.go index c129efd28..f2a366a53 100644 --- a/pkg/ai/context/logs_test.go +++ b/pkg/ai/context/logs_test.go @@ -180,3 +180,30 @@ func TestFilterLogs_RedactsSecrets(t *testing.T) { t.Errorf("Secret not redacted in log line: %s", result.Lines[0]) } } + +func TestFilterLogsByPattern_FiltersBeforeSummary(t *testing.T) { + lines := []string{ + "INFO checkout request ok", + "INFO cart request slow", + "INFO recommendation request ok", + } + input := strings.Join(lines, "\n") + + result, err := FilterLogsByPattern(input, "cart") + if err != nil { + t.Fatalf("FilterLogsByPattern returned error: %v", err) + } + if result.TotalLines != 1 { + t.Errorf("Expected TotalLines=1 after grep, got %d", result.TotalLines) + } + if len(result.Lines) != 1 || !strings.Contains(result.Lines[0], "cart request slow") { + t.Fatalf("Expected cart line, got %#v", result.Lines) + } +} + +func TestFilterLogsByPattern_InvalidRegex(t *testing.T) { + _, err := FilterLogsByPattern("INFO ok", "[") + if err == nil { + t.Fatal("Expected invalid regex error") + } +} diff --git a/pkg/ai/context/summary.go b/pkg/ai/context/summary.go index 47ece9d7f..fd27d4c6f 100644 --- a/pkg/ai/context/summary.go +++ b/pkg/ai/context/summary.go @@ -14,6 +14,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/pkg/resourcecontext" ) // ResourceSummary is the typed output for Summary-level minification. 
@@ -40,34 +42,42 @@ type ResourceSummary struct { Finalizers []string `json:"finalizers,omitempty"` // Type-specific fields (only populated when relevant) - Image string `json:"image,omitempty"` - Ports string `json:"ports,omitempty"` - Schedule string `json:"schedule,omitempty"` - Type string `json:"type,omitempty"` // Service type, Secret type - Selector string `json:"selector,omitempty"` - ClusterIP string `json:"clusterIP,omitempty"` - Hosts []string `json:"hosts,omitempty"` - Restarts int32 `json:"restarts,omitempty"` - Node string `json:"node,omitempty"` - Strategy string `json:"strategy,omitempty"` - Completions string `json:"completions,omitempty"` - Duration string `json:"duration,omitempty"` + Image string `json:"image,omitempty"` + Ports string `json:"ports,omitempty"` + Schedule string `json:"schedule,omitempty"` + Type string `json:"type,omitempty"` // Service type, Secret type + Selector string `json:"selector,omitempty"` + ClusterIP string `json:"clusterIP,omitempty"` + Hosts []string `json:"hosts,omitempty"` + Restarts int32 `json:"restarts,omitempty"` + Node string `json:"node,omitempty"` + Strategy string `json:"strategy,omitempty"` + Completions string `json:"completions,omitempty"` + Duration string `json:"duration,omitempty"` Suspended *bool `json:"suspended,omitempty"` Unschedulable *bool `json:"unschedulable,omitempty"` - Active int `json:"active,omitempty"` - Target string `json:"target,omitempty"` - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas int32 `json:"maxReplicas,omitempty"` - Current int32 `json:"current,omitempty"` - Desired int32 `json:"desired,omitempty"` - Roles []string `json:"roles,omitempty"` - Version string `json:"version,omitempty"` - Pressures []string `json:"pressures,omitempty"` - Keys []string `json:"keys,omitempty"` - StorageClass string `json:"storageClass,omitempty"` - Capacity string `json:"capacity,omitempty"` - AccessModes []string `json:"accessModes,omitempty"` - Owner string 
`json:"owner,omitempty"` + Active int `json:"active,omitempty"` + Target string `json:"target,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas int32 `json:"maxReplicas,omitempty"` + Current int32 `json:"current,omitempty"` + Desired int32 `json:"desired,omitempty"` + Roles []string `json:"roles,omitempty"` + Version string `json:"version,omitempty"` + Pressures []string `json:"pressures,omitempty"` + Keys []string `json:"keys,omitempty"` + StorageClass string `json:"storageClass,omitempty"` + Capacity string `json:"capacity,omitempty"` + AccessModes []string `json:"accessModes,omitempty"` + Owner string `json:"owner,omitempty"` + + // SummaryContext is the per-row enrichment attached by AI-facing list + // surfaces (REST /api/ai/resources/{kind}, MCP list_resources, search + // hits). Populated by handlers post-minify via resourcecontext.BuildSummary; + // nil when the caller opted out (?context=none) or when no fields apply. + // Type is resourcecontext.ResourceSummaryContext — the field name keeps + // the shorter "SummaryContext" form to match the wire JSON tag. + SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // summarize dispatches to the appropriate per-type extractor and then diff --git a/pkg/audit/checks.go b/pkg/audit/checks.go index 1e19dacb9..c61120190 100644 --- a/pkg/audit/checks.go +++ b/pkg/audit/checks.go @@ -935,6 +935,15 @@ func buildResults(findings []Finding) *ScanResults { categories[cat] = CategorySummary{} } + // Populate Group from the built-in (Kind→Group) table. Check emission + // sites leave Group="" so the per-check code stays terse — single + // point of truth here instead of every Finding{} literal. 
+ for i := range findings { + if findings[i].Group == "" { + findings[i].Group = GroupForBuiltinKind(findings[i].Kind) + } + } + // Merge findings: same (resource, checkID) get combined into one finding // with messages joined, so multi-container workloads show all affected containers. type checkKey struct{ resource, checkID string } @@ -942,7 +951,7 @@ func buildResults(findings []Finding) *ScanResults { var dedupFindings []Finding for _, f := range findings { - key := checkKey{ResourceKey(f.Kind, f.Namespace, f.Name), f.CheckID} + key := checkKey{ResourceKey(f.Group, f.Kind, f.Namespace, f.Name), f.CheckID} if idx, exists := mergeIndex[key]; exists { dedupFindings[idx].Message += "; " + f.Message continue diff --git a/pkg/audit/helpers.go b/pkg/audit/helpers.go index aee6ebea1..681a5eaa2 100644 --- a/pkg/audit/helpers.go +++ b/pkg/audit/helpers.go @@ -6,19 +6,23 @@ import ( "strings" ) -// ResourceKey returns the index key for a resource: "Kind/namespace/name". -func ResourceKey(kind, namespace, name string) string { - if namespace == "" { - return fmt.Sprintf("%s//%s", kind, name) - } - return fmt.Sprintf("%s/%s/%s", kind, namespace, name) +// ResourceKey returns the index key for a resource: +// "group|Kind|namespace|name". Group goes first because both group and +// namespace can legitimately be empty independently — encoding group +// last would leave a cluster-scoped CRD key ambiguous with a +// namespaced core-group key under any 3-part parse. "|" is a safe +// delimiter — Kubernetes API groups follow DNS subdomain rules and +// can't contain it. Mirrors the same shape as the issue-source key in +// internal/summarycontext. +func ResourceKey(group, kind, namespace, name string) string { + return fmt.Sprintf("%s|%s|%s|%s", group, kind, namespace, name) } // IndexByResource builds a lookup map from ResourceKey → []Finding. 
func IndexByResource(findings []Finding) map[string][]Finding { m := make(map[string][]Finding) for _, f := range findings { - key := ResourceKey(f.Kind, f.Namespace, f.Name) + key := ResourceKey(f.Group, f.Kind, f.Namespace, f.Name) m[key] = append(m[key], f) } return m @@ -33,6 +37,7 @@ func GroupByResource(findings []Finding) []ResourceGroup { for _, fs := range index { g := ResourceGroup{ Kind: fs[0].Kind, + Group: fs[0].Group, Namespace: fs[0].Namespace, Name: fs[0].Name, Findings: fs, @@ -55,13 +60,46 @@ func GroupByResource(findings []Finding) []ResourceGroup { if groups[i].Warning != groups[j].Warning { return groups[i].Warning > groups[j].Warning } - return ResourceKey(groups[i].Kind, groups[i].Namespace, groups[i].Name) < - ResourceKey(groups[j].Kind, groups[j].Namespace, groups[j].Name) + return ResourceKey(groups[i].Group, groups[i].Kind, groups[i].Namespace, groups[i].Name) < + ResourceKey(groups[j].Group, groups[j].Kind, groups[j].Namespace, groups[j].Name) }) return groups } +// GroupForBuiltinKind maps a built-in Kubernetes Kind to the API group +// the audit suite scans it under. Returns "" for kinds the suite +// doesn't recognize — those don't get a populated Finding.Group, which +// means cross-group collision risk is bounded to the listed built-ins +// vs. third-party CRDs sharing the same Kind name. +// +// Kept here (next to ResourceKey) so the Kind→Group mapping lives in +// one place rather than every Finding{} emission site. buildResults +// populates Finding.Group via this helper before the index is built; +// per-check code stays terse and group-agnostic. +// +// Also reused by internal/issues to resolve Group on Problem-sourced +// rows that pre-date group-aware emission — keeps the (Kind→Group) +// table in one place across packages. 
+func GroupForBuiltinKind(kind string) string { + switch kind { + case "Pod", "Service", "ConfigMap", "Secret", "Node", "Namespace", + "PersistentVolume", "PersistentVolumeClaim", "ServiceAccount": + return "" + case "Deployment", "DaemonSet", "StatefulSet", "ReplicaSet": + return "apps" + case "Job", "CronJob": + return "batch" + case "HorizontalPodAutoscaler": + return "autoscaling" + case "Ingress", "NetworkPolicy": + return "networking.k8s.io" + case "PodDisruptionBudget": + return "policy" + } + return "" +} + // ApplySettings filters audit results based on ignored namespaces (with wildcard // patterns like *-system) and disabled checks. This is the shared implementation // used by all consumers (HTTP handlers, MCP, skyhook-connector). diff --git a/pkg/audit/helpers_test.go b/pkg/audit/helpers_test.go new file mode 100644 index 000000000..7ee965f71 --- /dev/null +++ b/pkg/audit/helpers_test.go @@ -0,0 +1,71 @@ +package audit + +import "testing" + +// TestResourceKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups produce distinct +// keys. Pre-fix, ResourceKey was group-blind: a Knative +// serving.knative.dev/Service "api" in "prod" collided with the core +// "" /Service "api" in "prod", and IndexByResource would conflate +// their findings (whichever Finding came last would shadow the other +// in the dedup checkKey, and any lookup by ResourceKey returned the +// pooled set). The fix routes Group through the key. 
+func TestResourceKey_GroupAware(t *testing.T) { + core := ResourceKey("", "Service", "prod", "api") + knative := ResourceKey("serving.knative.dev", "Service", "prod", "api") + if core == knative { + t.Fatalf("ResourceKey collides across groups: %q == %q", core, knative) + } +} + +// TestIndexByResource_NoCrossGroupCollision exercises the same fix +// end-to-end: emit two Findings for kind/ns/name "Service/prod/api", +// one with Group="" (core) and one with Group="serving.knative.dev" +// (Knative), and verify each lookup returns ONLY its own finding — +// not the union. +func TestIndexByResource_NoCrossGroupCollision(t *testing.T) { + findings := []Finding{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", CheckID: "core-finding"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", CheckID: "knative-finding"}, + } + idx := IndexByResource(findings) + + core := idx[ResourceKey("", "Service", "prod", "api")] + if len(core) != 1 || core[0].CheckID != "core-finding" { + t.Errorf("core lookup: got %+v, want 1 finding with CheckID=core-finding", core) + } + knative := idx[ResourceKey("serving.knative.dev", "Service", "prod", "api")] + if len(knative) != 1 || knative[0].CheckID != "knative-finding" { + t.Errorf("knative lookup: got %+v, want 1 finding with CheckID=knative-finding", knative) + } +} + +// TestGroupForBuiltinKind pins the (Kind→Group) table used by +// buildResults to populate Finding.Group for emission sites that leave +// it empty. Centralising the table here keeps per-check code terse; +// drift between this table and the actual API group a check scans +// would silently mis-key findings. 
+func TestGroupForBuiltinKind(t *testing.T) { + cases := map[string]string{ + "Pod": "", + "Service": "", + "ConfigMap": "", + "Secret": "", + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "ReplicaSet": "apps", + "Job": "batch", + "CronJob": "batch", + "HorizontalPodAutoscaler": "autoscaling", + "Ingress": "networking.k8s.io", + "NetworkPolicy": "networking.k8s.io", + "PodDisruptionBudget": "policy", + "UnknownCRD": "", + } + for kind, want := range cases { + if got := GroupForBuiltinKind(kind); got != want { + t.Errorf("GroupForBuiltinKind(%q) = %q, want %q", kind, got, want) + } + } +} diff --git a/pkg/audit/types.go b/pkg/audit/types.go index a5b8f91f4..4d7d63da3 100644 --- a/pkg/audit/types.go +++ b/pkg/audit/types.go @@ -63,8 +63,12 @@ type ScanResults struct { // ResourceGroup aggregates findings for a single resource. // Groups are sorted by severity (danger first), then by name. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. type ResourceGroup struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` Warning int `json:"warning"` @@ -88,8 +92,14 @@ type CategorySummary struct { } // Finding represents a single best-practice violation. +// Group disambiguates kinds that collide across API groups +// (e.g. core/Service vs serving.knative.dev/Service); empty for the +// core API group. Check emission sites leave Group="" — buildResults +// populates it via GroupForBuiltinKind so the (Kind→Group) map lives +// in one place rather than every check function.
type Finding struct { Kind string `json:"kind"` + Group string `json:"group,omitempty"` Namespace string `json:"namespace"` Name string `json:"name"` CheckID string `json:"checkID"` diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go new file mode 100644 index 000000000..856f0dbe4 --- /dev/null +++ b/pkg/resourcecontext/build.go @@ -0,0 +1,1227 @@ +package resourcecontext + +import ( + "context" + "sort" + "strings" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + rbacv1 "k8s.io/api/rbac/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/topology" +) + +// Options carries everything Build needs to compute a ResourceContext. +// +// Per the v1 contract, this package depends only on pkg/topology — callers +// in internal/* pre-compute IssueSummary / AuditSummary / PolicyReports and +// pass them in, so we don't reach into internal/issues or internal/audit. +type Options struct { + Tier ContextTier + + // AccessChecker gates every emitted ContextRef. nil = no gating (treat + // as fully authorized — local-kubeconfig / tests). + AccessChecker RefAccessChecker + + // Topology data sources. When Topology is nil, the topology-derived + // fields (Exposes, SelectedBy, ScaledBy, ManagedBy, RunsOn, + // Uses.ServiceAccount) are skipped. + Topology *topology.Topology + Provider topology.ResourceProvider + DynamicProv topology.DynamicProvider + + // Relationships is the pre-computed per-resource projection. 
When non-nil, + // Build consumes it directly instead of calling + // topology.GetRelationshipsWithObject — single-resource handlers should + // leave this nil and let Build compute; bulk/list callers that already + // loop over relationships per row SHOULD pass it to avoid double work. + // + // Topology MUST still be set when Relationships is set — synthesis + // helpers (e.g. ManagedBy owner walk) read Topology and RelIndex through + // it. + Relationships *topology.Relationships + + // RelIndex is the topology inverted-edge index. Pass a shared instance + // (topology.IndexByResource(topo)) for high-fanout callers; nil is fine + // for single-resource Build paths — the per-call inline scan is O(E) once. + RelIndex *topology.RelationshipsIndex + + // Pre-computed summaries — pass-through into the response. + IssueSummary *IssueSummary + AuditSummary *AuditSummary + PolicyReports PolicyReportLookup // nil = Kyverno not installed / no findings + + // Optional kind-specific lookups. ServiceBackends is used only for + // Service resources to attach realized pod-selection state. The raw + // Service spec already carries selector/ports; this lookup answers + // whether that selector currently resolves to ready Pods. + ServiceBackends ServiceBackendLookup +} + +// PolicyReportLookup is the minimal interface Build needs from the +// PolicyReport index. The concrete index lives in pkg/policyreports. +// +// Build does not import pkg/policyreports directly because callers may +// adapt other policy engines into the same shape. +type PolicyReportLookup interface { + FindingsFor(group, kind, namespace, name string) []KyvernoFinding +} + +type ServiceBackendLookup interface { + PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) +} + +// RefAccessChecker abstracts the RBAC check so this package doesn't import +// any internal/* package. 
REST and MCP handlers each implement this with a +// request-scoped batch cache (see internal/server/rc_rbac.go). +// +// Implementations should treat (group, kind, namespace) as the cache key — +// per-name SAR has no upside since RBAC is namespace-granular. +type RefAccessChecker interface { + CanRead(ctx context.Context, group, kind, namespace string) bool +} + +// Build produces a ResourceContext for obj at the requested tier. +// +// Returns nil when obj is nil. Returns a zero-value (.Tier-only) +// ResourceContext when obj's type is unrecognized or no enrichment fields apply. +// Never panics on nil sub-fields of opts. +func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceContext { + if obj == nil { + return nil + } + + ident, ok := identityOf(obj) + if !ok { + return &ResourceContext{Tier: opts.Tier} + } + + rc := &ResourceContext{Tier: opts.Tier} + omitted := newOmittedTracker() + + // Topology-derived relationships drive ManagedBy / Exposes / SelectedBy / + // ScaledBy / RunsOn / Uses.ServiceAccount. T23 made + // topology.Relationships the canonical projection: server-side + // SynthesizeManagedBy walks the owner chain + GitOps signals, and the Pod + // hygiene fields (.ServiceAccount, .Node) are populated from pod.Spec. + // ManagedBy stays delegated to topology to avoid duplicating its owner-chain + // and GitOps logic. The direct Owner field below may still fall back to the + // object's controller OwnerReference when topology is absent or cold. + // + // Single-resource callers (REST GET, MCP get_resource) leave + // opts.Relationships nil and let us compute via GetRelationshipsWithObject + // — passing obj keeps kind/group disambiguation correct for CRDs whose + // plural collides with a core resource. Bulk callers that already loop + // over relationships per row pass them in directly.
+ rel := opts.Relationships + if rel == nil && opts.Topology != nil { + // Resolve the topology-pseudo-kind so cross-group CRDs (Knative + // serving.knative.dev/Service, CAPI cluster.x-k8s.io/Cluster, …) + // look up the right node. Using ident.Kind directly would lower- + // case to "service" and resolve to the core Service node, leaking + // the wrong resource's relationships into the CRD's resourceContext. + // The handler-side pre-computation does this same KindForGVK + // resolution; mirror it here so the fallback path doesn't undo it. + rel = topology.GetRelationshipsWithObject( + topology.KindForGVK(ident.Kind, ident.Group), ident.Namespace, ident.Name, obj, + opts.Topology, opts.Provider, opts.DynamicProv, opts.RelIndex, + ) + } + + // 1. ManagedBy — prefer Relationships.ManagedBy (server-synthesized when + // a topology is available; covers GitOps signals + owner-chain walk). + // Fall back to topology.SynthesizeManagedBy with the obj alone when no + // topology is provided — that path still detects Argo/Flux/Helm signals + // from labels and annotations without needing a graph. 
+ var managedBy []topology.ResourceRef + if rel != nil && len(rel.ManagedBy) > 0 { + managedBy = rel.ManagedBy + } else if rel == nil { + if m, ok := obj.(metav1.Object); ok { + managedBy = topology.SynthesizeManagedBy(m, ident.Kind, ident.Namespace, ident.Name, nil, nil, nil) + } + } + if len(managedBy) > 0 { + rc.ManagedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(managedBy), + "managedBy", omitted) + } + + if rel != nil && rel.Owner != nil { + candidate := &ContextRef{ + Kind: rel.Owner.Kind, + Group: rel.Owner.Group, + Namespace: rel.Owner.Namespace, + Name: rel.Owner.Name, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.Owner = candidate + } else { + omitted.add("owner", OmittedRBACDenied) + } + } else if owner := ownerFromObject(obj, ident.Namespace); owner != nil { + if checkRef(ctx, opts.AccessChecker, owner) { + rc.Owner = owner + } else { + omitted.add("owner", OmittedRBACDenied) + } + } + + // 2. Topology-derived: Exposes, SelectedBy, ScaledBy + if rel != nil { + exposes := make([]topology.ResourceRef, 0, len(rel.Services)+len(rel.Ingresses)+len(rel.Gateways)+len(rel.Routes)) + exposes = append(exposes, rel.Services...) + exposes = append(exposes, rel.Ingresses...) + exposes = append(exposes, rel.Gateways...) + exposes = append(exposes, rel.Routes...) + rc.Exposes = filterRefs(ctx, opts.AccessChecker, + toContextRefs(exposes), + "exposes", omitted) + + selected := make([]topology.ResourceRef, 0, len(rel.PDBs)+len(rel.NetworkPolicies)) + selected = append(selected, rel.PDBs...) + selected = append(selected, rel.NetworkPolicies...) + rc.SelectedBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(selected), + "selectedBy", omitted) + + rc.ScaledBy = filterRefs(ctx, opts.AccessChecker, + toContextRefs(rel.Scalers), + "scaledBy", omitted) + } + + // 3. Pod-specific: RunsOn (Node) + Uses (ConfigMap/Secret/PVC/SA). 
+ // + // RunsOn and Uses.ServiceAccount come from topology.Relationships when + // available (T23 populates them from pod.Spec server-side). We still + // scan pod.Spec.Volumes / .EnvFrom directly for the ConfigMap/Secret/PVC + // inventory — topology doesn't model those use-edges at the granularity + // Build needs. + if pod, ok := obj.(*corev1.Pod); ok { + rc.Uses = buildUsesFromPod(ctx, pod, opts.AccessChecker, omitted) + rc.PodSummary = buildPodSummary(pod) + + // Prefer rel.ServiceAccount over re-reading pod.Spec — same source, + // but consolidating through Relationships keeps Build aligned with + // how MCP/agents consume the field. + if rc.Uses != nil && rc.Uses.ServiceAccount == nil && rel != nil && rel.ServiceAccount != nil { + candidate := &ContextRef{ + Kind: rel.ServiceAccount.Kind, + Group: rel.ServiceAccount.Group, + Namespace: rel.ServiceAccount.Namespace, + Name: rel.ServiceAccount.Name, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.Uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + // RunsOn: prefer the topology-supplied Node ref. Fall back to + // pod.Spec.NodeName any time rel.Node is empty — the Node informer + // may be cold, the node may not yet be in the topology graph, or + // rel itself may be nil. The previous `else if rel == nil` guard + // dropped the fallback when topology was built but rel.Node hadn't + // been populated yet, leaving RunsOn empty even though the Pod + // spec clearly named a node. 
+ var nodeName, nodeGroup string + if rel != nil && rel.Node != nil { + nodeName = rel.Node.Name + nodeGroup = rel.Node.Group + } else { + nodeName = pod.Spec.NodeName + } + if nodeName != "" { + candidate := &ContextRef{ + Kind: "Node", + Group: nodeGroup, + Name: nodeName, + } + if checkRef(ctx, opts.AccessChecker, candidate) { + rc.RunsOn = candidate + } else { + omitted.add("runsOn", OmittedRBACDenied) + } + } + } + + if svc, ok := obj.(*corev1.Service); ok { + rc.ServiceSummary = buildServiceSummary(ctx, svc, opts.ServiceBackends, opts.AccessChecker, omitted) + } + if uses := buildUsesFromWorkload(ctx, obj, opts.AccessChecker, omitted); uses != nil { + rc.Uses = uses + } + rc.WorkloadSummary = buildWorkloadSummary(obj) + rc.IngressSummary = buildIngressSummary(ctx, obj, opts.AccessChecker, omitted) + rc.NodeSummary = buildNodeSummary(obj) + rc.PVCSummary = buildPVCSummary(obj) + rc.JobSummary = buildJobSummary(obj) + rc.CronJobSummary = buildCronJobSummary(ctx, obj, opts.AccessChecker, omitted) + rc.StatusSummary = buildStatusSummary(obj) + + // 4. Pre-computed summaries — pass-through. + rc.IssueSummary = opts.IssueSummary + rc.AuditSummary = opts.AuditSummary + + // 5. PolicyReports — Kyverno findings rolled up. Basic tier emits + // counts only (fail/warn/pass); diagnostic tier adds the top[] + // findings. Tier discrimination keeps the basic-tier wire size tight. + if opts.PolicyReports != nil { + findings := opts.PolicyReports.FindingsFor(ident.Group, ident.Kind, ident.Namespace, ident.Name) + if len(findings) > 0 { + rc.PolicySummary = buildPolicySummary(findings, opts.Tier) + } + } + + rc.Omitted = omitted.collect() + return rc +} + +// --------------------------------------------------------------------------- +// Identity extraction +// --------------------------------------------------------------------------- + +// resourceIdentity is the projection of obj that Build needs without holding +// on to the full runtime.Object. 
The (Kind, Namespace, Name) tuple keys +// topology relationship lookups and summary lookups; Group is also read by +// Build for topology kind resolution and policy-report lookups. +type resourceIdentity struct { + Kind string + Group string + Namespace string + Name string +} + +// identityOf extracts identity from a typed K8s object or unstructured. +// Returns (_, false) for unknown shapes so callers can short-circuit. +func identityOf(obj runtime.Object) (resourceIdentity, bool) { + if obj == nil { + return resourceIdentity{}, false + } + switch v := obj.(type) { + case *corev1.Pod: + return identFromMeta("Pod", "", &v.ObjectMeta), true + case *corev1.Service: + return identFromMeta("Service", "", &v.ObjectMeta), true + case *corev1.ConfigMap: + return identFromMeta("ConfigMap", "", &v.ObjectMeta), true + case *corev1.Secret: + return identFromMeta("Secret", "", &v.ObjectMeta), true + case *corev1.Node: + return identFromMeta("Node", "", &v.ObjectMeta), true + case *corev1.Namespace: + return identFromMeta("Namespace", "", &v.ObjectMeta), true + case *corev1.PersistentVolume: + return identFromMeta("PersistentVolume", "", &v.ObjectMeta), true + case *corev1.PersistentVolumeClaim: + return identFromMeta("PersistentVolumeClaim", "", &v.ObjectMeta), true + case *corev1.ServiceAccount: + return identFromMeta("ServiceAccount", "", &v.ObjectMeta), true + case *corev1.Event: + return identFromMeta("Event", "", &v.ObjectMeta), true + case *corev1.LimitRange: + return identFromMeta("LimitRange", "", &v.ObjectMeta), true + case *appsv1.Deployment: + return identFromMeta("Deployment", "apps", &v.ObjectMeta), true + case *appsv1.DaemonSet: + return identFromMeta("DaemonSet", "apps", &v.ObjectMeta), true + case *appsv1.StatefulSet: + return identFromMeta("StatefulSet", "apps", &v.ObjectMeta), true + case *appsv1.ReplicaSet: + return identFromMeta("ReplicaSet", "apps", &v.ObjectMeta), true + case *autoscalingv2.HorizontalPodAutoscaler: + return
identFromMeta("HorizontalPodAutoscaler", "autoscaling", &v.ObjectMeta), true + case *batchv1.Job: + return identFromMeta("Job", "batch", &v.ObjectMeta), true + case *batchv1.CronJob: + return identFromMeta("CronJob", "batch", &v.ObjectMeta), true + case *networkingv1.Ingress: + return identFromMeta("Ingress", "networking.k8s.io", &v.ObjectMeta), true + case *networkingv1.NetworkPolicy: + return identFromMeta("NetworkPolicy", "networking.k8s.io", &v.ObjectMeta), true + case *policyv1.PodDisruptionBudget: + return identFromMeta("PodDisruptionBudget", "policy", &v.ObjectMeta), true + case *storagev1.StorageClass: + return identFromMeta("StorageClass", "storage.k8s.io", &v.ObjectMeta), true + case *rbacv1.Role: + return identFromMeta("Role", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRole: + return identFromMeta("ClusterRole", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.RoleBinding: + return identFromMeta("RoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *rbacv1.ClusterRoleBinding: + return identFromMeta("ClusterRoleBinding", "rbac.authorization.k8s.io", &v.ObjectMeta), true + case *unstructured.Unstructured: + gvk := v.GroupVersionKind() + return resourceIdentity{ + Kind: gvk.Kind, + Group: gvk.Group, + Namespace: v.GetNamespace(), + Name: v.GetName(), + }, true + } + return resourceIdentity{}, false +} + +func identFromMeta(kind, group string, m *metav1.ObjectMeta) resourceIdentity { + return resourceIdentity{ + Kind: kind, + Group: group, + Namespace: m.Namespace, + Name: m.Name, + } +} + +func ownerFromObject(obj runtime.Object, namespace string) *ContextRef { + m, ok := obj.(metav1.Object) + if !ok { + return nil + } + owners := m.GetOwnerReferences() + if len(owners) == 0 { + return nil + } + chosen := owners[0] + for _, owner := range owners { + if owner.Controller != nil && *owner.Controller { + chosen = owner + break + } + } + return &ContextRef{ + Kind: chosen.Kind, + Group: 
groupFromAPIVersion(chosen.APIVersion), + Namespace: namespace, + Name: chosen.Name, + } +} + +func groupFromAPIVersion(apiVersion string) string { + if apiVersion == "" || !strings.Contains(apiVersion, "/") { + return "" + } + return strings.SplitN(apiVersion, "/", 2)[0] +} + +// --------------------------------------------------------------------------- +// Uses (Pod-specific) +// --------------------------------------------------------------------------- + +// buildUsesFromPod extracts ConfigMap/Secret/PVC/ServiceAccount references +// from pod.Spec. Returns nil when the pod uses no configuration. +// +// Sources scanned: +// - Volumes: ConfigMap / Secret / PVC / Projected (configMap + secret entries) +// - Containers (init + regular): EnvFrom configMapRef/secretRef, Env valueFrom.{configMap,secret}KeyRef +// - Spec.ServiceAccountName +func buildUsesFromPod(ctx context.Context, pod *corev1.Pod, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + if pod == nil { + return nil + } + return buildUsesFromPodSpec(ctx, pod.Namespace, pod.Spec, ac, omitted) +} + +func buildUsesFromPodSpec(ctx context.Context, namespace string, spec corev1.PodSpec, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + cmSet := newRefSet() + secretSet := newRefSet() + pvcSet := newRefSet() + + scanVolumes(spec.Volumes, namespace, cmSet, secretSet, pvcSet) + scanContainers(spec.InitContainers, namespace, cmSet, secretSet) + scanContainers(spec.Containers, namespace, cmSet, secretSet) + + uses := &UsesBlock{ + ConfigMaps: filterRefs(ctx, ac, cmSet.refs("ConfigMap", ""), "uses.configMaps", omitted), + Secrets: filterRefs(ctx, ac, secretSet.refs("Secret", ""), "uses.secrets", omitted), + PVCs: filterRefs(ctx, ac, pvcSet.refs("PersistentVolumeClaim", ""), "uses.pvcs", omitted), + } + + if sa := spec.ServiceAccountName; sa != "" { + candidate := &ContextRef{ + Kind: "ServiceAccount", + Namespace: namespace, + Name: sa, + } + if checkRef(ctx, ac, candidate) { + 
uses.ServiceAccount = candidate + } else { + omitted.add("uses.serviceAccount", OmittedRBACDenied) + } + } + + if len(uses.ConfigMaps) == 0 && len(uses.Secrets) == 0 && len(uses.PVCs) == 0 && uses.ServiceAccount == nil { + return nil + } + return uses +} + +func buildUsesFromWorkload(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *UsesBlock { + switch v := obj.(type) { + case *appsv1.Deployment: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.StatefulSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.DaemonSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *appsv1.ReplicaSet: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *batchv1.Job: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.Template.Spec, ac, omitted) + case *batchv1.CronJob: + return buildUsesFromPodSpec(ctx, v.Namespace, v.Spec.JobTemplate.Spec.Template.Spec, ac, omitted) + default: + return nil + } +} + +func scanVolumes(vols []corev1.Volume, ns string, cm, secret, pvc *refSet) { + for _, v := range vols { + if v.ConfigMap != nil { + cm.add(v.ConfigMap.Name, ns) + } + if v.Secret != nil { + secret.add(v.Secret.SecretName, ns) + } + if v.PersistentVolumeClaim != nil { + pvc.add(v.PersistentVolumeClaim.ClaimName, ns) + } + if v.Projected != nil { + for _, src := range v.Projected.Sources { + if src.ConfigMap != nil { + cm.add(src.ConfigMap.Name, ns) + } + if src.Secret != nil { + secret.add(src.Secret.Name, ns) + } + } + } + } +} + +func scanContainers(containers []corev1.Container, ns string, cm, secret *refSet) { + for _, c := range containers { + for _, ef := range c.EnvFrom { + if ef.ConfigMapRef != nil { + cm.add(ef.ConfigMapRef.Name, ns) + } + if ef.SecretRef != nil { + secret.add(ef.SecretRef.Name, ns) + } + } + for _, e := range c.Env { + if e.ValueFrom == nil { + 
continue + } + if e.ValueFrom.ConfigMapKeyRef != nil { + cm.add(e.ValueFrom.ConfigMapKeyRef.Name, ns) + } + if e.ValueFrom.SecretKeyRef != nil { + secret.add(e.ValueFrom.SecretKeyRef.Name, ns) + } + } + } +} + +const maxServicePodRefs = 10 + +func buildServiceSummary(ctx context.Context, svc *corev1.Service, lookup ServiceBackendLookup, ac RefAccessChecker, omitted *omittedTracker) *ServiceSummary { + if svc == nil { + return nil + } + out := &ServiceSummary{} + + if len(svc.Spec.Selector) == 0 { + if svc.Spec.Type != corev1.ServiceTypeExternalName { + out.Warnings = append(out.Warnings, ServiceWarningNoSelector) + } + return out + } + if lookup == nil { + return nil + } + + selector := labels.SelectorFromSet(labels.Set(svc.Spec.Selector)) + pods, err := lookup.PodsForServiceSelector(svc.Namespace, selector) + if err != nil { + return nil + } + + sel := &PodSelectionSummary{Total: len(pods)} + for _, pod := range pods { + ref := ContextRef{Kind: "Pod", Namespace: pod.Namespace, Name: pod.Name} + if !checkRef(ctx, ac, &ref) { + omitted.add("serviceSummary.selectedPods", OmittedRBACDenied) + continue + } + if isPodReady(pod) { + sel.Ready++ + appendBoundedPodRef(&sel.ReadyPods, ref, sel) + } else { + sel.NotReady++ + appendBoundedPodRef(&sel.NotReadyPods, ref, sel) + } + } + + if sel.Total == 0 { + out.Warnings = append(out.Warnings, ServiceWarningNoSelectedPods) + } else if sel.Ready == 0 { + out.Warnings = append(out.Warnings, ServiceWarningNoReadyPods) + } + out.SelectedPods = sel + return out +} + +func appendBoundedPodRef(dst *[]ContextRef, ref ContextRef, sel *PodSelectionSummary) { + if len(*dst) >= maxServicePodRefs { + sel.Truncated = true + return + } + *dst = append(*dst, ref) +} + +func isPodReady(pod *corev1.Pod) bool { + if pod == nil { + return false + } + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady { + return cond.Status == corev1.ConditionTrue + } + } + return false +} + +const maxSummaryItems = 12 + +func 
buildPodSummary(pod *corev1.Pod) *PodSummary { + if pod == nil { + return nil + } + out := &PodSummary{ + Phase: string(pod.Status.Phase), + Ready: isPodReady(pod), + } + statuses := make([]corev1.ContainerStatus, 0, len(pod.Status.InitContainerStatuses)+len(pod.Status.ContainerStatuses)) + statuses = append(statuses, pod.Status.InitContainerStatuses...) + statuses = append(statuses, pod.Status.ContainerStatuses...) + if len(statuses) > maxSummaryItems { + statuses = statuses[:maxSummaryItems] + } + for _, st := range statuses { + out.RestartCount += st.RestartCount + out.Containers = append(out.Containers, containerStateSummary(st)) + } + return out +} + +func containerStateSummary(st corev1.ContainerStatus) ContainerStateSummary { + out := ContainerStateSummary{ + Name: st.Name, + Ready: st.Ready, + RestartCount: st.RestartCount, + } + switch { + case st.State.Waiting != nil: + out.State = "waiting" + out.Reason = st.State.Waiting.Reason + case st.State.Running != nil: + out.State = "running" + case st.State.Terminated != nil: + out.State = "terminated" + out.Reason = st.State.Terminated.Reason + } + if st.LastTerminationState.Terminated != nil { + out.LastTerminationReason = st.LastTerminationState.Terminated.Reason + } + return out +} + +func buildWorkloadSummary(obj runtime.Object) *WorkloadSummary { + switch v := obj.(type) { + case *appsv1.Deployment: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Updated: v.Status.UpdatedReplicas, + Unavailable: v.Status.UnavailableReplicas, + }} + case *appsv1.StatefulSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Updated: v.Status.UpdatedReplicas, + }} + case *appsv1.DaemonSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: 
v.Status.DesiredNumberScheduled, + Ready: v.Status.NumberReady, + Available: v.Status.NumberAvailable, + Updated: v.Status.UpdatedNumberScheduled, + Unavailable: v.Status.NumberUnavailable, + }} + case *appsv1.ReplicaSet: + return &WorkloadSummary{Replicas: &ReplicaSummary{ + Desired: replicasOrZero(v.Spec.Replicas), + Ready: v.Status.ReadyReplicas, + Available: v.Status.AvailableReplicas, + Unavailable: maxInt32(0, v.Status.Replicas-v.Status.AvailableReplicas), + }} + default: + return nil + } +} + +func buildIngressSummary(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *IngressSummary { + ing, ok := obj.(*networkingv1.Ingress) + if !ok || ing == nil { + return nil + } + out := &IngressSummary{} + if ing.Spec.IngressClassName != nil { + out.Class = *ing.Spec.IngressClassName + } else if ing.Annotations != nil { + out.Class = ing.Annotations["kubernetes.io/ingress.class"] + } + for _, addr := range ing.Status.LoadBalancer.Ingress { + if addr.IP != "" { + out.Addresses = append(out.Addresses, addr.IP) + } else if addr.Hostname != "" { + out.Addresses = append(out.Addresses, addr.Hostname) + } + } + if len(out.Addresses) == 0 { + out.Warnings = append(out.Warnings, IngressWarningNoAddress) + } + if out.Class == "" { + out.Warnings = append(out.Warnings, IngressWarningNoClass) + } + if len(ing.Spec.Rules) == 0 { + out.Warnings = append(out.Warnings, IngressWarningNoRules) + } + + svcSet := newRefSet() + addIngressBackendService(svcSet, ing.Namespace, ing.Spec.DefaultBackend) + for _, rule := range ing.Spec.Rules { + if rule.HTTP == nil { + continue + } + for _, path := range rule.HTTP.Paths { + addIngressBackendService(svcSet, ing.Namespace, &path.Backend) + } + } + out.BackendServices = filterRefs(ctx, ac, svcSet.refs("Service", ""), "ingressSummary.backendServices", omitted) + + secretSet := newRefSet() + for _, tls := range ing.Spec.TLS { + secretSet.add(tls.SecretName, ing.Namespace) + } + out.TLSSecrets = filterRefs(ctx, 
ac, secretSet.refs("Secret", ""), "ingressSummary.tlsSecrets", omitted) + + if out.Class == "" && len(out.Addresses) == 0 && len(out.BackendServices) == 0 && len(out.TLSSecrets) == 0 && len(out.Warnings) == 0 { + return nil + } + return out +} + +func addIngressBackendService(dst *refSet, namespace string, backend *networkingv1.IngressBackend) { + if backend == nil || backend.Service == nil { + return + } + dst.add(backend.Service.Name, namespace) +} + +func buildNodeSummary(obj runtime.Object) *NodeSummary { + node, ok := obj.(*corev1.Node) + if !ok || node == nil { + return nil + } + out := &NodeSummary{ + Unschedulable: node.Spec.Unschedulable, + Capacity: compactResourceList(node.Status.Capacity), + Allocatable: compactResourceList(node.Status.Allocatable), + } + if out.Unschedulable { + out.Warnings = append(out.Warnings, NodeWarningUnschedulable) + } + for _, cond := range node.Status.Conditions { + switch cond.Type { + case corev1.NodeReady: + out.ReadyStatus = string(cond.Status) + if cond.Status != corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningNotReady) + } + case corev1.NodeDiskPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningDiskPressure) + } + case corev1.NodeMemoryPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningMemoryPressure) + } + case corev1.NodePIDPressure: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningPIDPressure) + } + case corev1.NodeNetworkUnavailable: + if cond.Status == corev1.ConditionTrue { + out.Warnings = append(out.Warnings, NodeWarningNetworkUnavailable) + } + } + } + for _, taint := range node.Spec.Taints { + out.Taints = append(out.Taints, TaintSummary{ + Key: taint.Key, + Value: taint.Value, + Effect: string(taint.Effect), + }) + } + return out +} + +func compactResourceList(resources corev1.ResourceList) map[string]string { + if len(resources) == 0 { + 
return nil + } + out := make(map[string]string, 4) + for _, name := range []corev1.ResourceName{ + corev1.ResourceCPU, + corev1.ResourceMemory, + corev1.ResourcePods, + corev1.ResourceEphemeralStorage, + } { + if qty, ok := resources[name]; ok { + out[string(name)] = qty.String() + } + } + if len(out) == 0 { + return nil + } + return out +} + +func buildPVCSummary(obj runtime.Object) *PVCSummary { + pvc, ok := obj.(*corev1.PersistentVolumeClaim) + if !ok || pvc == nil { + return nil + } + out := &PVCSummary{ + Phase: string(pvc.Status.Phase), + StorageClassName: valueOrEmpty(pvc.Spec.StorageClassName), + VolumeName: pvc.Spec.VolumeName, + VolumeMode: string(valueOrZero(pvc.Spec.VolumeMode)), + } + if req, ok := pvc.Spec.Resources.Requests[corev1.ResourceStorage]; ok { + out.RequestedStorage = req.String() + } + if cap, ok := pvc.Status.Capacity[corev1.ResourceStorage]; ok { + out.CapacityStorage = cap.String() + } + for _, mode := range pvc.Spec.AccessModes { + out.AccessModes = append(out.AccessModes, string(mode)) + } + if pvc.Annotations != nil { + out.Provisioner = pvc.Annotations["volume.kubernetes.io/storage-provisioner"] + out.SelectedNode = pvc.Annotations["volume.kubernetes.io/selected-node"] + out.BindCompleted = pvc.Annotations["pv.kubernetes.io/bind-completed"] + } + switch pvc.Status.Phase { + case corev1.ClaimPending: + out.Warnings = append(out.Warnings, PVCWarningPending) + case corev1.ClaimLost: + out.Warnings = append(out.Warnings, PVCWarningLost) + } + return out +} + +func buildJobSummary(obj runtime.Object) *JobSummary { + job, ok := obj.(*batchv1.Job) + if !ok || job == nil { + return nil + } + out := &JobSummary{ + Active: job.Status.Active, + Succeeded: job.Status.Succeeded, + Failed: job.Status.Failed, + Completions: int32OrDefault(job.Spec.Completions, 1), + Parallelism: int32OrDefault(job.Spec.Parallelism, 1), + BackoffLimit: int32OrDefault(job.Spec.BackoffLimit, 6), + Suspended: boolOrFalse(job.Spec.Suspend), + } + return out +} + +func 
buildCronJobSummary(ctx context.Context, obj runtime.Object, ac RefAccessChecker, omitted *omittedTracker) *CronJobSummary { + cj, ok := obj.(*batchv1.CronJob) + if !ok || cj == nil { + return nil + } + out := &CronJobSummary{ + Schedule: cj.Spec.Schedule, + Suspended: boolOrFalse(cj.Spec.Suspend), + } + if cj.Status.LastScheduleTime != nil { + out.LastScheduleTime = cj.Status.LastScheduleTime.Format("2006-01-02T15:04:05Z07:00") + } + if cj.Status.LastSuccessfulTime != nil { + out.LastSuccessfulTime = cj.Status.LastSuccessfulTime.Format("2006-01-02T15:04:05Z07:00") + } + active := make([]ContextRef, 0, len(cj.Status.Active)) + for _, ref := range cj.Status.Active { + if ref.Kind == "" || ref.Name == "" { + continue + } + active = append(active, ContextRef{ + Kind: ref.Kind, + Group: groupFromAPIVersion(ref.APIVersion), + Namespace: cj.Namespace, + Name: ref.Name, + }) + } + out.ActiveJobs = filterRefs(ctx, ac, active, "cronJobSummary.activeJobs", omitted) + return out +} + +func replicasOrZero(p *int32) int32 { + if p == nil { + return 0 + } + return *p +} + +func int32OrDefault(p *int32, fallback int32) int32 { + if p == nil { + return fallback + } + return *p +} + +func boolOrFalse(p *bool) bool { + return p != nil && *p +} + +func valueOrEmpty(p *string) string { + if p == nil { + return "" + } + return *p +} + +func valueOrZero[T ~string](p *T) T { + var zero T + if p == nil { + return zero + } + return *p +} + +func maxInt32(a, b int32) int32 { + if a > b { + return a + } + return b +} + +func buildStatusSummary(obj runtime.Object) *StatusSummary { + if obj == nil { + return nil + } + u, ok := objectToMap(obj) + if !ok { + return nil + } + status, ok, _ := unstructured.NestedMap(u, "status") + if !ok { + return nil + } + out := &StatusSummary{} + if phase, ok, _ := unstructured.NestedString(status, "phase"); ok { + out.Phase = phase + } + if conditions, ok, _ := unstructured.NestedSlice(status, "conditions"); ok { + if len(conditions) > maxSummaryItems { + 
conditions = conditions[:maxSummaryItems] + } + for _, item := range conditions { + cond, ok := item.(map[string]interface{}) + if !ok { + continue + } + summary := ConditionSummary{ + Type: stringField(cond, "type"), + Status: stringField(cond, "status"), + Reason: stringField(cond, "reason"), + Message: truncateRunes(stringField(cond, "message"), 300), + LastTransitionTime: stringField(cond, "lastTransitionTime"), + } + if summary.Type == "" && summary.Status == "" { + continue + } + out.Conditions = append(out.Conditions, summary) + } + } + if out.Phase == "" && len(out.Conditions) == 0 { + return nil + } + return out +} + +func objectToMap(obj runtime.Object) (map[string]interface{}, bool) { + if u, ok := obj.(*unstructured.Unstructured); ok { + return u.Object, true + } + out, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, false + } + return out, true +} + +func stringField(m map[string]interface{}, key string) string { + if v, ok := m[key].(string); ok { + return v + } + return "" +} + +func truncateRunes(s string, limit int) string { + if limit <= 0 || len(s) == 0 { + return "" + } + runes := []rune(s) + if len(runes) <= limit { + return s + } + return string(runes[:limit]) +} + +// refSet collects (name, namespace) pairs with insertion-order preservation +// for deterministic output. Names with empty namespaces are tolerated (the +// PVC ClaimName can be cluster-scoped only in odd configurations, but we +// pass through whatever the pod spec says). 
+type refSet struct { + seen map[string]bool + order []nsName +} + +type nsName struct { + Namespace string + Name string +} + +func newRefSet() *refSet { + return &refSet{seen: make(map[string]bool)} +} + +func (s *refSet) add(name, ns string) { + if name == "" { + return + } + key := ns + "/" + name + if s.seen[key] { + return + } + s.seen[key] = true + s.order = append(s.order, nsName{Namespace: ns, Name: name}) +} + +// refs returns the accumulated set as ContextRefs sorted by (namespace, name) +// for deterministic golden output. +func (s *refSet) refs(kind, group string) []ContextRef { + if len(s.order) == 0 { + return nil + } + out := make([]ContextRef, len(s.order)) + sorted := append([]nsName(nil), s.order...) + sort.Slice(sorted, func(i, j int) bool { + if sorted[i].Namespace != sorted[j].Namespace { + return sorted[i].Namespace < sorted[j].Namespace + } + return sorted[i].Name < sorted[j].Name + }) + for i, e := range sorted { + out[i] = ContextRef{ + Kind: kind, + Group: group, + Namespace: e.Namespace, + Name: e.Name, + } + } + return out +} + +// --------------------------------------------------------------------------- +// Topology ref → ContextRef +// --------------------------------------------------------------------------- + +// toContextRefs translates a slice of topology.ResourceRef into ContextRefs. +// Sorted by (kind, namespace, name) for determinism — golden tests rely on +// this ordering. 
+func toContextRefs(refs []topology.ResourceRef) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + for _, r := range refs { + out = append(out, ContextRef{ + Kind: r.Kind, + Group: r.Group, + Namespace: r.Namespace, + Name: r.Name, + }) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Kind != out[j].Kind { + return out[i].Kind < out[j].Kind + } + if out[i].Namespace != out[j].Namespace { + return out[i].Namespace < out[j].Namespace + } + return out[i].Name < out[j].Name + }) + return out +} + +// --------------------------------------------------------------------------- +// RBAC gating +// --------------------------------------------------------------------------- + +// filterRefs applies the access check to each ref. Denied refs are dropped +// and one omitted entry is recorded per field (deduped by the tracker). +// When ac is nil (local-kubeconfig / no auth), every ref passes. +func filterRefs(ctx context.Context, ac RefAccessChecker, refs []ContextRef, fieldPath string, omitted *omittedTracker) []ContextRef { + if len(refs) == 0 { + return nil + } + out := make([]ContextRef, 0, len(refs)) + deniedAny := false + for _, r := range refs { + if !checkRef(ctx, ac, &r) { + deniedAny = true + continue + } + out = append(out, r) + } + if deniedAny { + omitted.add(fieldPath, OmittedRBACDenied) + } + if len(out) == 0 { + return nil + } + return out +} + +// checkRef returns true when ac permits a read of (group, kind, namespace). +// Nil ac = permit everything. +func checkRef(ctx context.Context, ac RefAccessChecker, r *ContextRef) bool { + if ac == nil || r == nil { + return true + } + return ac.CanRead(ctx, r.Group, r.Kind, r.Namespace) +} + +// --------------------------------------------------------------------------- +// Policy summary +// --------------------------------------------------------------------------- + +// buildPolicySummary rolls up Kyverno findings into the summary block. 
+// Top findings are picked first by fail > warn > error > pass, then by +// stable input order — capped at policySummaryTopMax. +// +// Tier discrimination: basic emits counts only (Fail/Warn/Pass) for a +// minimal wire footprint; diagnostic adds the Top[] findings. Locked +// in the plan's v1 contract. +const policySummaryTopMax = 3 + +func buildPolicySummary(findings []KyvernoFinding, tier ContextTier) *PolicySummary { + var fail, warn, pass int + for _, f := range findings { + switch f.Result { + case "fail": + fail++ + case "warn": + warn++ + case "pass": + pass++ + } + } + + ks := &KyvernoSummary{ + Fail: fail, + Warn: warn, + Pass: pass, + } + + // Top[] only on diagnostic tier. Basic stays counts-only. + if tier == TierDiagnostic { + ordered := append([]KyvernoFinding(nil), findings...) + sort.SliceStable(ordered, func(i, j int) bool { + return resultRank(ordered[i].Result) < resultRank(ordered[j].Result) + }) + if len(ordered) > policySummaryTopMax { + ordered = ordered[:policySummaryTopMax] + } + ks.Top = ordered + } + + return &PolicySummary{Kyverno: ks} +} + +func resultRank(r string) int { + switch r { + case "fail": + return 0 + case "warn": + return 1 + case "error": + return 2 + case "pass": + return 3 + default: + return 4 + } +} + +// --------------------------------------------------------------------------- +// Omitted tracker +// --------------------------------------------------------------------------- + +// omittedTracker deduplicates (field, reason) entries so callers don't emit +// "managedBy" / OmittedRBACDenied twice when multiple refs in the same field +// fail. Insertion order is preserved for stable JSON output. 
+type omittedTracker struct { + seen map[string]bool + items []OmittedField +} + +func newOmittedTracker() *omittedTracker { + return &omittedTracker{seen: make(map[string]bool)} +} + +func (t *omittedTracker) add(field string, reason OmittedReason) { + key := field + "|" + string(reason) + if t.seen[key] { + return + } + t.seen[key] = true + t.items = append(t.items, OmittedField{Field: field, Reason: reason}) +} + +func (t *omittedTracker) collect() []OmittedField { + if len(t.items) == 0 { + return nil + } + return t.items +} diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go new file mode 100644 index 000000000..0729bce78 --- /dev/null +++ b/pkg/resourcecontext/build_test.go @@ -0,0 +1,918 @@ +package resourcecontext + +import ( + "context" + "encoding/json" + "testing" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + + "github.com/skyhook-io/radar/pkg/topology" +) + +// --------------------------------------------------------------------------- +// Test scaffolding +// --------------------------------------------------------------------------- + +// allowAllChecker permits every CanRead check. Used by the happy-path +// goldens that don't exercise RBAC denial. +type allowAllChecker struct{} + +func (allowAllChecker) CanRead(_ context.Context, _, _, _ string) bool { return true } + +// denyChecker denies a specific (group, kind, namespace) tuple and permits +// everything else. Tests the "omitted: rbac_denied" path without requiring +// the full server stack. 
type denyChecker struct {
	group     string
	kind      string
	namespace string
}

// CanRead reports false only when all three arguments equal the configured
// (group, kind, namespace) tuple; any other combination is permitted.
func (d denyChecker) CanRead(_ context.Context, group, kind, namespace string) bool {
	return !(group == d.group && kind == d.kind && namespace == d.namespace)
}

// mockPolicyReports implements PolicyReportLookup.
// Entries are keyed by "kind/namespace/name".
type mockPolicyReports map[string][]KyvernoFinding

// FindingsFor returns the findings stored under kind/namespace/name. The
// group argument is not part of the key and is ignored.
func (m mockPolicyReports) FindingsFor(group, kind, namespace, name string) []KyvernoFinding {
	return m[kind+"/"+namespace+"/"+name]
}

// mockServiceBackends is a fixed list of pods that serves as the backend
// lookup passed via Options.ServiceBackends in the Service tests.
type mockServiceBackends []*corev1.Pod

// PodsForServiceSelector returns the pods in namespace whose labels match
// selector, in list order. It never returns an error.
func (m mockServiceBackends) PodsForServiceSelector(namespace string, selector labels.Selector) ([]*corev1.Pod, error) {
	out := make([]*corev1.Pod, 0, len(m))
	for _, pod := range m {
		if pod.Namespace == namespace && selector.Matches(labels.Set(pod.Labels)) {
			out = append(out, pod)
		}
	}
	return out, nil
}

// ---------------------------------------------------------------------------
// Golden-file tests
// ---------------------------------------------------------------------------

// TestBuild_Pod_FullEnrichment builds context for a Pod that has an Argo
// tracking annotation, a ReplicaSet owner, volume and env references, and
// topology edges from a Service, NetworkPolicy, PDB and HPA, then checks
// each relationship field on the result.
func TestBuild_Pod_FullEnrichment(t *testing.T) {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "web-abc",
			Namespace: "prod",
			Labels: map[string]string{
				"app.kubernetes.io/name": "web",
			},
			Annotations: map[string]string{
				"argocd.argoproj.io/tracking-id": "argocd_storefront:apps/Deployment:prod/web",
			},
			OwnerReferences: []metav1.OwnerReference{
				{Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "web-7d", Controller: ptrBool(true)},
			},
		},
		Spec: corev1.PodSpec{
			NodeName:           "node-1",
			ServiceAccountName: "web-sa",
			Volumes: []corev1.Volume{
				{
					Name: "config",
					VolumeSource: corev1.VolumeSource{
						ConfigMap: &corev1.ConfigMapVolumeSource{
							LocalObjectReference: corev1.LocalObjectReference{Name: "web-config"},
						},
					},
				},
				{
					Name: "creds",
					VolumeSource: corev1.VolumeSource{
						Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"},
					},
				},
				{
					Name: "data",
					VolumeSource: corev1.VolumeSource{
						PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "web-data"},
					},
				},
			},
			Containers: []corev1.Container{
				{
					Name: "web",
					EnvFrom: []corev1.EnvFromSource{
						{ConfigMapRef: &corev1.ConfigMapEnvSource{LocalObjectReference: corev1.LocalObjectReference{Name: "shared-env"}}},
					},
					Env: []corev1.EnvVar{
						{
							Name: "API_KEY",
							ValueFrom: &corev1.EnvVarSource{
								SecretKeyRef: &corev1.SecretKeySelector{
									LocalObjectReference: corev1.LocalObjectReference{Name: "api-key-secret"},
									Key:                  "key",
								},
							},
						},
					},
				},
			},
		},
	}

	// Every edge targets the pod; the assertions below check which result
	// field each source kind ends up in.
	topo := &topology.Topology{
		Nodes: []topology.Node{
			{ID: "pod/prod/web-abc", Kind: topology.KindPod, Name: "web-abc"},
			{ID: "service/prod/web", Kind: topology.KindService, Name: "web"},
			{ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"},
			{ID: "poddisruptionbudget/prod/web-pdb", Kind: topology.KindPDB, Name: "web-pdb"},
			{ID: "horizontalpodautoscaler/prod/web-hpa", Kind: topology.KindHPA, Name: "web-hpa"},
		},
		Edges: []topology.Edge{
			{Source: "service/prod/web", Target: "pod/prod/web-abc", Type: topology.EdgeRoutesTo},
			{Source: "networkpolicy/prod/default-deny", Target: "pod/prod/web-abc", Type: topology.EdgeProtects},
			{Source: "poddisruptionbudget/prod/web-pdb", Target: "pod/prod/web-abc", Type: topology.EdgeProtects},
			{Source: "horizontalpodautoscaler/prod/web-hpa", Target: "pod/prod/web-abc", Type: topology.EdgeUses},
		},
	}

	opts := Options{
		Tier:          TierBasic,
		AccessChecker: allowAllChecker{},
		Topology:      topo,
		IssueSummary: &IssueSummary{
			Count: 1, HighestSeverity: "critical", TopReason: "ImagePullBackOff",
			BySource: map[string]int{"problem": 1},
		},
	}

	rc := Build(context.Background(), pod, opts)
	if rc == nil {
		t.Fatal("Build returned nil")
	}

	// ManagedBy: argo tracking-id annotation wins over owner reference.
	if got, want := len(rc.ManagedBy), 1; got != want {
		t.Fatalf("ManagedBy len: got %d want %d (%+v)", got, want, rc.ManagedBy)
	}
	mb := rc.ManagedBy[0]
	if mb.Kind != "Application" || mb.Name != "storefront" || mb.Namespace != "argocd" {
		t.Errorf("ManagedBy[0]: got %+v, want Application argocd/storefront", mb)
	}

	// Exposes: the Service routes to the pod.
	if got, want := len(rc.Exposes), 1; got != want {
		t.Fatalf("Exposes len: got %d want %d (%+v)", got, want, rc.Exposes)
	}
	if rc.Exposes[0].Kind != "Service" || rc.Exposes[0].Name != "web" {
		t.Errorf("Exposes[0]: got %+v want Service/prod/web", rc.Exposes[0])
	}

	// SelectedBy: NP + PDB, sorted by kind (NetworkPolicy < PodDisruptionBudget).
	if got, want := len(rc.SelectedBy), 2; got != want {
		t.Fatalf("SelectedBy len: got %d want %d (%+v)", got, want, rc.SelectedBy)
	}
	if rc.SelectedBy[0].Kind != "NetworkPolicy" || rc.SelectedBy[1].Kind != "PodDisruptionBudget" {
		t.Errorf("SelectedBy order: got %s,%s want NetworkPolicy,PodDisruptionBudget",
			rc.SelectedBy[0].Kind, rc.SelectedBy[1].Kind)
	}

	// ScaledBy: HPA.
	if got, want := len(rc.ScaledBy), 1; got != want {
		t.Fatalf("ScaledBy len: got %d want %d", got, want)
	}
	if rc.ScaledBy[0].Kind != "HorizontalPodAutoscaler" {
		t.Errorf("ScaledBy[0].Kind: got %q", rc.ScaledBy[0].Kind)
	}

	// RunsOn: Node.
	if rc.RunsOn == nil || rc.RunsOn.Name != "node-1" {
		t.Errorf("RunsOn: got %+v want Node/node-1", rc.RunsOn)
	}
	// Owner: the controller owner reference, with group taken from "apps/v1".
	if rc.Owner == nil || rc.Owner.Kind != "ReplicaSet" || rc.Owner.Name != "web-7d" || rc.Owner.Group != "apps" {
		t.Errorf("Owner: got %+v want apps/ReplicaSet prod/web-7d", rc.Owner)
	}

	// Uses: 2 ConfigMaps (web-config + shared-env), 2 Secrets (web-creds + api-key-secret), 1 PVC, ServiceAccount.
	if rc.Uses == nil {
		t.Fatal("Uses: got nil")
	}
	if got, want := len(rc.Uses.ConfigMaps), 2; got != want {
		t.Errorf("Uses.ConfigMaps len: got %d want %d (%+v)", got, want, rc.Uses.ConfigMaps)
	}
	if got, want := len(rc.Uses.Secrets), 2; got != want {
		t.Errorf("Uses.Secrets len: got %d want %d (%+v)", got, want, rc.Uses.Secrets)
	}
	if got, want := len(rc.Uses.PVCs), 1; got != want {
		t.Errorf("Uses.PVCs len: got %d want %d", got, want)
	}
	if rc.Uses.ServiceAccount == nil || rc.Uses.ServiceAccount.Name != "web-sa" {
		t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount)
	}

	// Pre-computed summaries are passed through.
	if rc.IssueSummary == nil || rc.IssueSummary.Count != 1 {
		t.Errorf("IssueSummary not passed through: %+v", rc.IssueSummary)
	}
	// No audit summary was supplied in opts, so none is expected back.
	if rc.AuditSummary != nil {
		t.Errorf("AuditSummary: want nil, got %+v", rc.AuditSummary)
	}
}

// TestBuild_Deployment_OwnerRefHelmRelease checks that the Flux
// helm.toolkit.fluxcd.io name/namespace labels on a Deployment resolve
// ManagedBy to the corresponding HelmRelease, including its API group.
func TestBuild_Deployment_OwnerRefHelmRelease(t *testing.T) {
	// Flux HelmRelease labels identify the manager of this Deployment.
	// NOTE(review): an earlier version of this comment said the labels take
	// precedence over a "ReplicaSet web-7d" owner, but the fixture sets no
	// OwnerReferences, so that precedence is not exercised here — confirm
	// whether an owner reference should be added.
	dep := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "web",
			Namespace: "prod",
			Labels: map[string]string{
				"helm.toolkit.fluxcd.io/name":      "web",
				"helm.toolkit.fluxcd.io/namespace": "flux-system",
			},
		},
	}

	rc := Build(context.Background(), dep, Options{
		Tier:          TierBasic,
		AccessChecker: allowAllChecker{},
	})
	if rc == nil {
		t.Fatal("Build returned nil")
	}
	if got, want := len(rc.ManagedBy), 1; got != want {
		t.Fatalf("ManagedBy len: got %d want %d", got, want)
	}
	mb := rc.ManagedBy[0]
	if mb.Kind != "HelmRelease" || mb.Name != "web" || mb.Namespace != "flux-system" {
		t.Errorf("ManagedBy[0]: got %+v want HelmRelease/flux-system/web", mb)
	}
	if mb.Group != "helm.toolkit.fluxcd.io" {
		t.Errorf("ManagedBy[0].Group: got %q", mb.Group)
	}
}

// TestBuild_Deployment_WorkloadSummaryAndTemplateUses checks that a
// Deployment yields a replica summary taken from spec/status and a Uses
// block derived from its pod template (ConfigMap volume, Secret env
// reference, ServiceAccount).
func TestBuild_Deployment_WorkloadSummaryAndTemplateUses(t *testing.T) {
	replicas := int32(3)
	dep := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					ServiceAccountName: "api-sa",
					Volumes: []corev1.Volume{{
						Name: "settings",
						VolumeSource: corev1.VolumeSource{
							ConfigMap: &corev1.ConfigMapVolumeSource{
								LocalObjectReference: corev1.LocalObjectReference{Name: "api-settings"},
							},
						},
					}},
					Containers: []corev1.Container{{
						Name: "api",
						Env: []corev1.EnvVar{{
							Name: "TOKEN",
							ValueFrom: &corev1.EnvVarSource{
								SecretKeyRef: &corev1.SecretKeySelector{
									LocalObjectReference: corev1.LocalObjectReference{Name: "api-token"},
									Key:                  "token",
								},
							},
						}},
					}},
				},
			},
		},
		Status: appsv1.DeploymentStatus{
			ReadyReplicas:       2,
			AvailableReplicas:   2,
			UpdatedReplicas:     3,
			UnavailableReplicas: 1,
		},
	}

	// Unlike the tests above, rc is not nil-checked before use; a nil result
	// from Build would panic on the next line rather than fail cleanly.
	rc := Build(context.Background(), dep, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}})
	if rc.WorkloadSummary == nil || rc.WorkloadSummary.Replicas == nil {
		t.Fatalf("WorkloadSummary.Replicas: got nil; rc=%+v", rc)
	}
	rep := rc.WorkloadSummary.Replicas
	if rep.Desired != 3 || rep.Ready != 2 || rep.Available != 2 || rep.Updated != 3 || rep.Unavailable != 1 {
		t.Errorf("Replicas: got %+v", rep)
	}
	if rc.Uses == nil {
		t.Fatal("Uses: got nil")
	}
	if got, want := len(rc.Uses.ConfigMaps), 1; got != want {
		t.Errorf("Uses.ConfigMaps len: got %d want %d (%+v)", got, want, rc.Uses.ConfigMaps)
	}
	if got, want := len(rc.Uses.Secrets), 1; got != want {
		t.Errorf("Uses.Secrets len: got %d want %d (%+v)", got, want, rc.Uses.Secrets)
	}
	if rc.Uses.ServiceAccount == nil || rc.Uses.ServiceAccount.Name != "api-sa" {
		t.Errorf("Uses.ServiceAccount: got %+v", rc.Uses.ServiceAccount)
	}
}

// TestBuild_Pod_PodSummary checks the pod summary for a Running pod with one
// crash-looping container and one running sidecar: phase, readiness, the
// summed restart count, and the first container's state and reasons.
func TestBuild_Pod_PodSummary(t *testing.T) {
	// readyPod is defined outside this view; presumably the final argument
	// sets the pod's Ready condition — the !Ready assertion below relies on it.
	pod := readyPod("api-1", "prod", map[string]string{"app": "api"}, true)
	pod.Status.Phase = corev1.PodRunning
	pod.Status.ContainerStatuses = []corev1.ContainerStatus{
		{
			Name:         "api",
			Ready:        false,
			RestartCount: 2,
			State: corev1.ContainerState{
				Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
			},
			LastTerminationState: corev1.ContainerState{
				Terminated: &corev1.ContainerStateTerminated{Reason: "Error"},
			},
		},
		{
			Name:  "sidecar",
			Ready: true,
			State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
		},
	}

	rc := Build(context.Background(), pod, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}})
	if rc.PodSummary == nil {
		t.Fatal("PodSummary: got nil")
	}
	// RestartCount 2 = 2 (api) + 0 (sidecar).
	if rc.PodSummary.Phase != "Running" || !rc.PodSummary.Ready || rc.PodSummary.RestartCount != 2 {
		t.Errorf("PodSummary: got %+v", rc.PodSummary)
	}
	if got, want := len(rc.PodSummary.Containers), 2; got != want {
		t.Fatalf("Containers len: got %d want %d", got, want)
	}
	c := rc.PodSummary.Containers[0]
	if c.State != "waiting" || c.Reason != "CrashLoopBackOff" || c.LastTerminationReason != "Error" {
		t.Errorf("Container[0]: got %+v", c)
	}
}
+ +func TestBuild_Service_ExposedByIngress(t *testing.T) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{"app": "api"}, + }, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "service/prod/api", Kind: topology.KindService, Name: "api"}, + {ID: "ingress/prod/api-ingress", Kind: topology.KindIngress, Name: "api-ingress"}, + }, + Edges: []topology.Edge{ + {Source: "ingress/prod/api-ingress", Target: "service/prod/api", Type: topology.EdgeRoutesTo}, + }, + } + rc := Build(context.Background(), svc, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + ServiceBackends: mockServiceBackends{ + readyPod("api-1", "prod", map[string]string{"app": "api"}, true), + readyPod("api-2", "prod", map[string]string{"app": "api"}, false), + readyPod("other", "prod", map[string]string{"app": "other"}, true), + }, + }) + + if got, want := len(rc.Exposes), 1; got != want { + t.Fatalf("Exposes len: got %d want %d", got, want) + } + if rc.Exposes[0].Kind != "Ingress" || rc.Exposes[0].Name != "api-ingress" { + t.Errorf("Exposes[0]: got %+v", rc.Exposes[0]) + } + // Service has no Uses block — make sure we don't synthesize an empty one. 
+ if rc.Uses != nil { + t.Errorf("Uses should be nil for Service: got %+v", rc.Uses) + } + if rc.ServiceSummary == nil || rc.ServiceSummary.SelectedPods == nil { + t.Fatalf("ServiceSummary.SelectedPods: got nil; rc=%+v", rc) + } + if got, want := rc.ServiceSummary.SelectedPods.Total, 2; got != want { + t.Errorf("SelectedPods.Total: got %d want %d", got, want) + } + if got, want := rc.ServiceSummary.SelectedPods.Ready, 1; got != want { + t.Errorf("SelectedPods.Ready: got %d want %d", got, want) + } + if got, want := rc.ServiceSummary.SelectedPods.NotReady, 1; got != want { + t.Errorf("SelectedPods.NotReady: got %d want %d", got, want) + } + if len(rc.ServiceSummary.Warnings) != 0 { + t.Errorf("ServiceSummary.Warnings: got %+v want none", rc.ServiceSummary.Warnings) + } +} + +func TestBuild_Service_NoReadyPodsWarning(t *testing.T) { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{"app": "api"}, + }, + } + rc := Build(context.Background(), svc, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + ServiceBackends: mockServiceBackends{ + readyPod("api-1", "prod", map[string]string{"app": "api"}, false), + }, + }) + if rc.ServiceSummary == nil { + t.Fatal("ServiceSummary: got nil") + } + if got := rc.ServiceSummary.Warnings; len(got) != 1 || got[0] != ServiceWarningNoReadyPods { + t.Fatalf("Warnings: got %+v want [%s]", got, ServiceWarningNoReadyPods) + } +} + +func TestBuild_IngressSummary_BackendsTLSAndWarnings(t *testing.T) { + className := "nginx" + ing := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "prod"}, + Spec: networkingv1.IngressSpec{ + IngressClassName: &className, + DefaultBackend: &networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{Name: "fallback"}, + }, + Rules: []networkingv1.IngressRule{{ + Host: "example.com", + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: 
&networkingv1.HTTPIngressRuleValue{Paths: []networkingv1.HTTPIngressPath{{ + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{Name: "web"}, + }, + }}}, + }, + }}, + TLS: []networkingv1.IngressTLS{{SecretName: "web-tls"}}, + }, + Status: networkingv1.IngressStatus{LoadBalancer: networkingv1.IngressLoadBalancerStatus{ + Ingress: []networkingv1.IngressLoadBalancerIngress{{Hostname: "lb.example.com"}}, + }}, + } + + rc := Build(context.Background(), ing, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.IngressSummary == nil { + t.Fatal("IngressSummary: got nil") + } + if rc.IngressSummary.Class != "nginx" { + t.Errorf("Class: got %q", rc.IngressSummary.Class) + } + if got, want := len(rc.IngressSummary.BackendServices), 2; got != want { + t.Fatalf("BackendServices len: got %d want %d (%+v)", got, want, rc.IngressSummary.BackendServices) + } + if got, want := len(rc.IngressSummary.TLSSecrets), 1; got != want { + t.Fatalf("TLSSecrets len: got %d want %d", got, want) + } + if len(rc.IngressSummary.Warnings) != 0 { + t.Errorf("Warnings: got %+v want none", rc.IngressSummary.Warnings) + } +} + +func TestBuild_NodeSummary(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Spec: corev1.NodeSpec{ + Unschedulable: true, + Taints: []corev1.Taint{{ + Key: "dedicated", + Value: "batch", + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + corev1.ResourcePods: resource.MustParse("110"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("3900m"), + corev1.ResourceMemory: resource.MustParse("14Gi"), + }, + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionFalse}, + {Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue}, + }, + }, + } + + rc := 
Build(context.Background(), node, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.NodeSummary == nil { + t.Fatal("NodeSummary: got nil") + } + if rc.NodeSummary.ReadyStatus != "False" || !rc.NodeSummary.Unschedulable { + t.Errorf("NodeSummary: got %+v", rc.NodeSummary) + } + if rc.NodeSummary.Capacity["cpu"] != "4" || rc.NodeSummary.Allocatable["memory"] != "14Gi" { + t.Errorf("Capacity/Allocatable: got %+v / %+v", rc.NodeSummary.Capacity, rc.NodeSummary.Allocatable) + } + if got, want := len(rc.NodeSummary.Taints), 1; got != want { + t.Fatalf("Taints len: got %d want %d", got, want) + } + if got := rc.NodeSummary.Warnings; len(got) != 3 { + t.Errorf("Warnings: got %+v want unschedulable/not_ready/memory_pressure", got) + } +} + +func TestBuild_PVCSummary(t *testing.T) { + storageClass := "standard" + volumeMode := corev1.PersistentVolumeFilesystem + pvc := &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "data", + Namespace: "prod", + Annotations: map[string]string{ + "volume.kubernetes.io/storage-provisioner": "pd.csi.storage.gke.io", + "volume.kubernetes.io/selected-node": "node-1", + "pv.kubernetes.io/bind-completed": "yes", + }, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + StorageClassName: &storageClass, + VolumeName: "pv-data", + VolumeMode: &volumeMode, + AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("10Gi")}, + }, + }, + Status: corev1.PersistentVolumeClaimStatus{ + Phase: corev1.ClaimPending, + Capacity: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("8Gi")}, + }, + } + + rc := Build(context.Background(), pvc, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.PVCSummary == nil { + t.Fatal("PVCSummary: got nil") + } + if rc.PVCSummary.Phase != "Pending" || rc.PVCSummary.RequestedStorage != "10Gi" || rc.PVCSummary.CapacityStorage 
!= "8Gi" { + t.Errorf("PVCSummary: got %+v", rc.PVCSummary) + } + if got := rc.PVCSummary.Warnings; len(got) != 1 || got[0] != PVCWarningPending { + t.Errorf("Warnings: got %+v", got) + } +} + +func TestBuild_JobAndCronJobSummary(t *testing.T) { + completions := int32(5) + parallelism := int32(2) + backoff := int32(3) + suspend := true + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod"}, + Spec: batchv1.JobSpec{ + Completions: &completions, + Parallelism: ¶llelism, + BackoffLimit: &backoff, + Suspend: &suspend, + }, + Status: batchv1.JobStatus{Active: 1, Succeeded: 2, Failed: 1}, + } + rc := Build(context.Background(), job, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.JobSummary == nil { + t.Fatal("JobSummary: got nil") + } + if rc.JobSummary.Completions != 5 || rc.JobSummary.Parallelism != 2 || rc.JobSummary.BackoffLimit != 3 || !rc.JobSummary.Suspended { + t.Errorf("JobSummary: got %+v", rc.JobSummary) + } + + cj := &batchv1.CronJob{ + ObjectMeta: metav1.ObjectMeta{Name: "nightly", Namespace: "prod"}, + Spec: batchv1.CronJobSpec{Schedule: "0 0 * * *", Suspend: &suspend}, + Status: batchv1.CronJobStatus{ + Active: []corev1.ObjectReference{{ + APIVersion: "batch/v1", + Kind: "Job", + Name: "nightly-1", + }}, + }, + } + rc = Build(context.Background(), cj, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.CronJobSummary == nil { + t.Fatal("CronJobSummary: got nil") + } + if rc.CronJobSummary.Schedule != "0 0 * * *" || !rc.CronJobSummary.Suspended { + t.Errorf("CronJobSummary: got %+v", rc.CronJobSummary) + } + if got, want := len(rc.CronJobSummary.ActiveJobs), 1; got != want { + t.Fatalf("ActiveJobs len: got %d want %d", got, want) + } +} + +func TestBuild_NetworkPolicy_OutgoingEdgeNotSurfaced(t *testing.T) { + // NetworkPolicy on the "policy side" emits an outgoing EdgeProtects to + // the workload it selects. 
The topology relationships projection does + // NOT surface that direction (see relationships.go's intentional skip). + // Build inherits this — the NP should have nothing in SelectedBy. + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "default-deny", Namespace: "prod"}, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "networkpolicy/prod/default-deny", Kind: topology.KindNetworkPolicy, Name: "default-deny"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "networkpolicy/prod/default-deny", Target: "deployment/prod/web", Type: topology.EdgeProtects}, + }, + } + rc := Build(context.Background(), np, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + }) + if rc == nil { + t.Fatal("Build returned nil") + } + if len(rc.SelectedBy) != 0 { + t.Errorf("SelectedBy: expected empty (outgoing EdgeProtects not surfaced), got %+v", rc.SelectedBy) + } +} + +func TestBuild_ConfigMap_OwnerOnly(t *testing.T) { + // A ConfigMap owned by a Deployment via EdgeManages — owner-chain + // ManagedBy is sourced from topology.SynthesizeManagedBy walking the + // owner graph (T23 canonical projection). No Pod spec, no GitOps + // labels — just the topology owner edge. 
+ cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "web-config", + Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "Deployment", APIVersion: "apps/v1", Name: "web", Controller: ptrBool(true)}, + }, + }, + } + topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "configmap/prod/web-config", Kind: topology.KindConfigMap, Name: "web-config"}, + {ID: "deployment/prod/web", Kind: topology.KindDeployment, Name: "web"}, + }, + Edges: []topology.Edge{ + {Source: "deployment/prod/web", Target: "configmap/prod/web-config", Type: topology.EdgeManages}, + }, + } + rc := Build(context.Background(), cm, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + }) + if got, want := len(rc.ManagedBy), 1; got != want { + t.Fatalf("ManagedBy len: got %d want %d", got, want) + } + mb := rc.ManagedBy[0] + if mb.Kind != "Deployment" || mb.Name != "web" || mb.Namespace != "prod" { + t.Errorf("ManagedBy[0]: got %+v", mb) + } +} + +func TestBuild_RBACDenied_AppendsOmitted(t *testing.T) { + // Deny reads on Secrets in the pod's namespace — buildUsesFromPod + // should drop them all and emit an omitted entry. 
+ pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{ + Name: "creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "web-creds"}, + }, + }}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: denyChecker{group: "", kind: "Secret", namespace: "prod"}, + }) + if rc.Uses != nil && len(rc.Uses.Secrets) != 0 { + t.Errorf("Secrets should be empty after deny; got %+v", rc.Uses.Secrets) + } + gotOmitted := false + for _, o := range rc.Omitted { + if o.Field == "uses.secrets" && o.Reason == OmittedRBACDenied { + gotOmitted = true + break + } + } + if !gotOmitted { + t.Errorf("expected omitted [uses.secrets, rbac_denied]; got %+v", rc.Omitted) + } +} + +func TestBuild_NilObj(t *testing.T) { + if rc := Build(context.Background(), nil, Options{}); rc != nil { + t.Errorf("Build(nil) = %+v, want nil", rc) + } +} + +func TestBuild_HPA_Identity(t *testing.T) { + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "web-hpa", Namespace: "prod"}, + } + rc := Build(context.Background(), hpa, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc == nil { + t.Fatal("Build returned nil for HPA") + } + if rc.Tier != TierBasic { + t.Errorf("Tier: got %q want %q", rc.Tier, TierBasic) + } +} + +func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { + // Basic tier emits counts only (fail/warn/pass). Top[] is reserved + // for diagnostic tier — keeps the basic-tier wire footprint minimal. 
+ pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) != 0 { + t.Errorf("basic tier must NOT emit Top[]; got %d entries: %+v", len(k.Top), k.Top) + } +} + +func TestBuild_PolicyReports_DiagnosticTierIncludesTop(t *testing.T) { + // Diagnostic tier adds the Top[] findings (capped at 3, ordered + // fail > warn > error > pass). Used by the deep agent investigation + // path — basic tier is for everyday triage. 
+ pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "prod"}} + reports := mockPolicyReports{ + "Pod/prod/p": { + {Policy: "require-labels", Rule: "check-app", Result: "fail", Message: "missing label"}, + {Policy: "require-labels", Rule: "check-env", Result: "warn"}, + {Policy: "no-host-network", Rule: "main", Result: "pass"}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierDiagnostic, + AccessChecker: allowAllChecker{}, + PolicyReports: reports, + }) + if rc.PolicySummary == nil || rc.PolicySummary.Kyverno == nil { + t.Fatalf("PolicySummary.Kyverno: got nil; rc=%+v", rc) + } + k := rc.PolicySummary.Kyverno + if k.Fail != 1 || k.Warn != 1 || k.Pass != 1 { + t.Errorf("Kyverno counts: got fail=%d warn=%d pass=%d", k.Fail, k.Warn, k.Pass) + } + if len(k.Top) == 0 { + t.Fatal("diagnostic tier must emit Top[] findings") + } + if k.Top[0].Result != "fail" { + t.Errorf("Top[0] should be the failing finding; got %+v", k.Top) + } +} + +func TestBuild_PDB_OutputJSONShape(t *testing.T) { + // Pin the wire shape one full populated Build produces, so a future + // reorder of fields (or accidental omitempty change) is caught. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", Namespace: "prod", + OwnerReferences: []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "apps/v1", Name: "rs", Controller: ptrBool(true)}, + }, + }, + Spec: corev1.PodSpec{NodeName: "n1"}, + } + // Topology with the owner edge so SynthesizeManagedBy can walk the + // chain and emit a ReplicaSet ManagedBy ref for wire-shape coverage. 
+ topo := &topology.Topology{ + Nodes: []topology.Node{ + {ID: "pod/prod/p", Kind: topology.KindPod, Name: "p"}, + {ID: "replicaset/prod/rs", Kind: topology.KindReplicaSet, Name: "rs"}, + }, + Edges: []topology.Edge{ + {Source: "replicaset/prod/rs", Target: "pod/prod/p", Type: topology.EdgeManages}, + }, + } + rc := Build(context.Background(), pod, Options{ + Tier: TierBasic, + AccessChecker: allowAllChecker{}, + Topology: topo, + }) + b, err := json.MarshalIndent(rc, "", " ") + if err != nil { + t.Fatalf("marshal: %v", err) + } + // Spot-check: tier basic, owner ref managedBy, runsOn node. + want := `"managedBy"` + if !contains(string(b), want) { + t.Errorf("JSON missing %s\n%s", want, b) + } + if !contains(string(b), `"tier": "basic"`) { + t.Errorf("JSON missing tier=basic\n%s", b) + } + if !contains(string(b), `"runsOn"`) { + t.Errorf("JSON missing runsOn\n%s", b) + } +} + +func TestBuild_Unstructured_StatusSummary(t *testing.T) { + obj := &unstructured.Unstructured{Object: map[string]interface{}{ + "apiVersion": "example.com/v1", + "kind": "Widget", + "metadata": map[string]interface{}{ + "name": "w1", + "namespace": "prod", + }, + "status": map[string]interface{}{ + "phase": "Reconciling", + "conditions": []interface{}{ + map[string]interface{}{ + "type": "Ready", + "status": "False", + "reason": "DependencyMissing", + "message": "waiting for dependency", + "lastTransitionTime": "2026-05-21T10:00:00Z", + }, + }, + }, + }} + + rc := Build(context.Background(), obj, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) + if rc.StatusSummary == nil { + t.Fatal("StatusSummary: got nil") + } + if rc.StatusSummary.Phase != "Reconciling" { + t.Errorf("Phase: got %q", rc.StatusSummary.Phase) + } + if got, want := len(rc.StatusSummary.Conditions), 1; got != want { + t.Fatalf("Conditions len: got %d want %d", got, want) + } + cond := rc.StatusSummary.Conditions[0] + if cond.Type != "Ready" || cond.Status != "False" || cond.Reason != "DependencyMissing" { + 
t.Errorf("Condition: got %+v", cond) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func ptrBool(b bool) *bool { return &b } + +func readyPod(name, namespace string, podLabels map[string]string, ready bool) *corev1.Pod { + status := corev1.ConditionFalse + if ready { + status = corev1.ConditionTrue + } + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: podLabels, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: status, + }}, + }, + } +} + +func contains(s, sub string) bool { + return len(s) >= len(sub) && indexOf(s, sub) >= 0 +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} + +// Compile-time pin: keep PDB and Networking imports referenced for future tests. +var ( + _ = policyv1.PodDisruptionBudget{} +) diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go new file mode 100644 index 000000000..15302bff0 --- /dev/null +++ b/pkg/resourcecontext/summary.go @@ -0,0 +1,214 @@ +package resourcecontext + +import ( + "strings" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// SummaryOptions configures the compact per-result enrichment produced by +// BuildSummary. All fields are pre-computed by the caller — this +// package never touches the issue engine, topology builder, or audit +// cache directly. Handlers in internal/* (REST list, MCP list_resources, +// search) walk the per-request topology + issue indexes once and pass +// the per-result digest in here. +type SummaryOptions struct { + // ManagedBy is the compact owner/GitOps pointer attached to the summary. 
+ // Callers derive this from topology.Relationships via + // ManagedByFromOwner; nil leaves the field absent. + ManagedBy *ManagedByRef + + // IssueCount is the count of internal issue-engine findings scoped to + // the subject resource. Callers pre-compute a per-namespace index + // (e.g. via internal/issues.ComposeWithStats) once per request and + // pass the count in for each result. Zero omits the field. + IssueCount int + + // Health, when non-empty, overrides the derived health string. The + // default is computed from resource status via deriveHealth — Pod + // container readiness, replica-count workloads, and the standard + // Ready/Available condition on CRDs. Non-trivial kinds derive to "". + Health string +} + +// BuildSummary produces the compact per-result ResourceSummaryContext +// attached to list_resources, /api/ai/resources/{kind} list, and search +// hits. +// +// Tightly bounded — only the triage fields needed to choose a next hop. +// Returns nil when all three fields would be empty so callers can +// `omitempty` the entire object on bare results and keep the wire shape minimal. +func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryContext { + health := opts.Health + if health == "" { + health = deriveHealth(obj) + } + if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { + return nil + } + return &ResourceSummaryContext{ + ManagedBy: opts.ManagedBy, + Health: health, + IssueCount: opts.IssueCount, + } +} + +// ManagedByFromOwner assembles a compact ManagedByRef from raw owner +// fields (typically pulled out of topology.Relationships in the handler). +// Returns nil when ownerKind or ownerName is empty so callers don't +// have to guard the assignment. 
+// +// Source classification: +// - "argocd" for argoproj.io kinds (Application, ApplicationSet, Rollout) +// - "flux" for *.fluxcd.io kinds (Kustomization, HelmRelease, GitRepository, …) +// - "helm" for the native Helm release pseudo-owner (kind "HelmRelease" +// with no group — emitted by topology's detectManagedByFromMeta to +// distinguish from Flux's HelmRelease CR in helm.toolkit.fluxcd.io) +// - "native" for everything else (Deployment, StatefulSet, DaemonSet, ReplicaSet, Job, …) +func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) *ManagedByRef { + if ownerKind == "" || ownerName == "" { + return nil + } + return &ManagedByRef{ + Kind: ownerKind, + Source: sourceForOwner(ownerKind, ownerGroup), + Name: ownerName, + Namespace: ownerNamespace, + } +} + +func sourceForOwner(ownerKind, group string) string { + // Native Helm install: topology synthesizes a {Kind:"HelmRelease", Group:""} + // pseudo-owner from Helm's release-name/namespace annotations. This must + // be classified BEFORE the group-based GitOps branches so we don't fall + // through to "native" — Flux's HelmRelease lives at helm.toolkit.fluxcd.io + // and is handled by the *.fluxcd.io branch below. + if ownerKind == "HelmRelease" && group == "" { + return "helm" + } + switch group { + case "argoproj.io": + return "argocd" + } + if strings.HasSuffix(group, ".fluxcd.io") { + return "flux" + } + return "native" +} + +// deriveHealth applies a tiny per-kind heuristic to classify a resource +// as "healthy" | "degraded" | "unhealthy". Kinds we don't recognize +// derive to "" and the field is omitted on the wire. +// +// Vocabulary matches the broader status-tone scheme used across the UI +// (k8s-ui StatusTone) so consumers don't need to translate. 
+func deriveHealth(obj runtime.Object) string { + if obj == nil { + return "" + } + switch o := obj.(type) { + case *corev1.Pod: + return podHealth(o) + case *appsv1.Deployment: + // Use Spec.Replicas (desired) not Status.Replicas (current). During + // scale-down or rolling updates, Status.Replicas can exceed + // Spec.Replicas while terminating pods drain; comparing ReadyReplicas + // against Status.Replicas would falsely report "degraded" when all + // desired replicas are actually ready. Matches StatefulSet semantics. + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *appsv1.StatefulSet: + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *appsv1.DaemonSet: + return replicasHealth(o.Status.NumberReady, o.Status.DesiredNumberScheduled) + case *appsv1.ReplicaSet: + // Same Spec-vs-Status concern as Deployment above. 
+ desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *unstructured.Unstructured: + return unstructuredHealth(o) + } + return "" +} + +func podHealth(p *corev1.Pod) string { + switch p.Status.Phase { + case corev1.PodRunning: + if len(p.Status.ContainerStatuses) == 0 { + return "degraded" + } + for _, cs := range p.Status.ContainerStatuses { + if !cs.Ready { + return "degraded" + } + } + return "healthy" + case corev1.PodSucceeded: + return "healthy" + case corev1.PodFailed: + return "unhealthy" + case corev1.PodPending: + return "degraded" + } + return "" +} + +func replicasHealth(ready, desired int32) string { + if desired <= 0 { + return "" + } + if ready >= desired { + return "healthy" + } + if ready <= 0 { + return "unhealthy" + } + return "degraded" +} + +// unstructuredHealth derives health for CRDs that follow the standard +// Ready/Available condition pattern. Returns "" for kinds without a +// matching condition so we don't emit a misleading status for resources +// whose status shape we don't understand. 
+func unstructuredHealth(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + conditions, found, _ := unstructured.NestedSlice(u.Object, "status", "conditions") + if !found || len(conditions) == 0 { + return "" + } + for _, c := range conditions { + cond, ok := c.(map[string]any) + if !ok { + continue + } + condType, _ := cond["type"].(string) + if condType != "Ready" && condType != "Available" { + continue + } + status, _ := cond["status"].(string) + switch status { + case "True": + return "healthy" + case "False": + return "unhealthy" + default: + return "degraded" + } + } + return "" +} diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go new file mode 100644 index 000000000..96c4979be --- /dev/null +++ b/pkg/resourcecontext/summary_test.go @@ -0,0 +1,391 @@ +package resourcecontext + +import ( + "encoding/json" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// TestBuildSummary_NilWhenEmpty pins that BuildSummary returns nil when +// every field would be empty — keeps the per-row JSON minimal. +func TestBuildSummary_NilWhenEmpty(t *testing.T) { + // ConfigMap has no health heuristic and no caller-supplied options. + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "x", Namespace: "y"}} + if got := BuildSummary(cm, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(ConfigMap, {}) = %#v, want nil", got) + } +} + +// TestBuildSummary_PodGoldens golden-files BuildSummary across the +// Pod phases that drive the health heuristic. Locks the wire shape +// for the common "list pods" call. 
+func TestBuildSummary_PodGoldens(t *testing.T) { + cases := []struct { + name string + pod *corev1.Pod + opts SummaryOptions + want string + }{ + { + name: "running_all_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + {Name: "sidecar", Ready: true}, + }, + }, + }, + want: `{"health":"healthy"}`, + }, + { + name: "running_one_not_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: false}, + }, + }, + }, + want: `{"health":"degraded"}`, + }, + { + name: "failed", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "pending", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodPending}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "running_with_issues_and_managedby", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + }, + }, + }, + opts: SummaryOptions{ + ManagedBy: ManagedByFromOwner("ReplicaSet", "apps", "prod", "api-7d5"), + IssueCount: 2, + }, + want: `{"managedBy":{"kind":"ReplicaSet","source":"native","name":"api-7d5","namespace":"prod"},"health":"healthy","issueCount":2}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := BuildSummary(c.pod, c.opts) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, err := json.Marshal(got) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_DeploymentReplicasHealth covers the replica-driven +// health heuristic across the Deployment cases. 
+func TestBuildSummary_DeploymentReplicasHealth(t *testing.T) { + cases := []struct { + name string + ready int32 + desired int32 + wantSlice []byte // JSON of BuildSummary output + }{ + {"all_ready", 3, 3, []byte(`{"health":"healthy"}`)}, + {"none_ready", 0, 3, []byte(`{"health":"unhealthy"}`)}, + {"partial", 1, 3, []byte(`{"health":"degraded"}`)}, + {"scaled_to_zero", 0, 0, nil}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + desired := c.desired + dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &desired, // desired is Spec.Replicas (not Status) — see deriveHealth + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: c.ready, + // Status.Replicas mirrors the actual non-terminated pod count + // in real clusters; we set it equal to ready here so the + // fixture matches a steady-state Deployment for that test. + Replicas: c.ready, + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if c.wantSlice == nil { + if got != nil { + t.Fatalf("got %#v, want nil", got) + } + return + } + if got == nil { + t.Fatalf("got nil, want %s", c.wantSlice) + } + b, _ := json.Marshal(got) + if string(b) != string(c.wantSlice) { + t.Errorf("got %s\nwant %s", b, c.wantSlice) + } + }) + } +} + +// TestBuildSummary_DeploymentHealthDuringScaleDown pins the Spec-vs-Status +// regression flagged on PR #722: during rolling updates or scale-down, +// Status.Replicas (current pod count) can exceed Spec.Replicas (desired). +// Before the fix, deriveHealth compared ReadyReplicas against Status.Replicas +// and reported "degraded" because not all current pods were ready — even +// though all DESIRED replicas were ready and the cluster was healthily +// draining excess pods. Use Spec.Replicas as the denominator instead. 
+func TestBuildSummary_DeploymentHealthDuringScaleDown(t *testing.T) { + desired := int32(2) + dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{Replicas: &desired}, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 2, // all DESIRED replicas are ready + Replicas: 4, // but 2 extras still terminating from a scale-down + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if got == nil { + t.Fatal("got nil, want ResourceSummaryContext with health=healthy") + } + if got.Health != "healthy" { + t.Errorf("Health = %q, want %q (Spec.Replicas=2 ready, Status.Replicas=4 due to draining)", got.Health, "healthy") + } +} + +// TestBuildSummary_ReplicaSetHealthDuringScaleDown pins the same fix for +// ReplicaSet — the Deployment regression also applied here. +func TestBuildSummary_ReplicaSetHealthDuringScaleDown(t *testing.T) { + desired := int32(3) + rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{Replicas: &desired}, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 3, + Replicas: 5, + }, + } + got := BuildSummary(rs, SummaryOptions{}) + if got == nil || got.Health != "healthy" { + t.Errorf("ReplicaSet during scale-down: got %+v, want Health=healthy", got) + } +} + +// TestBuildSummary_NetworkPolicy verifies BuildSummary handles a kind +// without a health heuristic — it should only emit fields the caller +// supplied (e.g. issueCount, managedBy) and skip health entirely. +func TestBuildSummary_NetworkPolicy(t *testing.T) { + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "deny-all", Namespace: "prod"}, + } + // Empty opts → nil; the kind has no health heuristic so no field is set. + if got := BuildSummary(np, SummaryOptions{}); got != nil { + t.Fatalf("got %#v, want nil", got) + } + // IssueCount only → summary with just issueCount. 
+ got := BuildSummary(np, SummaryOptions{IssueCount: 3}) + if got == nil { + t.Fatalf("got nil, want summary with issueCount") + } + b, _ := json.Marshal(got) + want := `{"issueCount":3}` + if string(b) != want { + t.Errorf("got %s\nwant %s", b, want) + } +} + +// TestBuildSummary_UnstructuredReadyCondition covers the CRD fallback +// — Ready/Available conditions are translated to the health vocabulary. +func TestBuildSummary_UnstructuredReadyCondition(t *testing.T) { + cases := []struct { + name string + conditions []any + want string + }{ + { + name: "ready_true", + conditions: []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + { + name: "ready_false", + conditions: []any{ + map[string]any{"type": "Ready", "status": "False"}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "ready_unknown", + conditions: []any{ + map[string]any{"type": "Ready", "status": "Unknown"}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "available_true", + conditions: []any{ + map[string]any{"type": "Available", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + u := &unstructured.Unstructured{Object: map[string]any{ + "apiVersion": "example.io/v1", + "kind": "Widget", + "status": map[string]any{"conditions": c.conditions}, + }} + got := BuildSummary(u, SummaryOptions{}) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, _ := json.Marshal(got) + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_HealthOverride pins that caller-supplied Health +// short-circuits the per-kind heuristic. 
+func TestBuildSummary_HealthOverride(t *testing.T) { + dep := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ReadyReplicas: 3, Replicas: 3}, + } + got := BuildSummary(dep, SummaryOptions{Health: "degraded"}) + if got == nil || got.Health != "degraded" { + t.Fatalf("Health override ignored: %#v", got) + } +} + +// TestManagedByFromOwner pins source classification for each cluster +// of owner kinds we care about. +func TestManagedByFromOwner(t *testing.T) { + cases := []struct { + name string + kind string + group string + namespace string + ownerName string + want *ManagedByRef + }{ + { + name: "empty_kind", + kind: "", + ownerName: "x", + want: nil, + }, + { + name: "empty_name", + kind: "Deployment", + ownerName: "", + want: nil, + }, + { + name: "deployment", + kind: "Deployment", + group: "apps", + namespace: "prod", + ownerName: "api", + want: &ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + }, + { + name: "argocd_application", + kind: "Application", + group: "argoproj.io", + namespace: "argocd", + ownerName: "storefront", + want: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, + }, + { + name: "flux_kustomization", + kind: "Kustomization", + group: "kustomize.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_helmrelease", + kind: "HelmRelease", + group: "helm.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_gitrepository", + kind: "GitRepository", + group: "source.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "repo", + want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, + }, + { + // Native Helm release: 
topology's detectManagedByFromMeta emits + // {Kind:"HelmRelease", Group:""} when it sees Helm's release-name + // annotation (no Flux/GitOps signal). Must classify as "helm", + // not "native" — distinguishes Helm-managed resources in the + // list/search UI from raw kubectl-applied ones. The Flux + // HelmRelease CR lives at helm.toolkit.fluxcd.io and is covered + // by the case above; the empty-group form is unambiguous. + name: "native_helm_release", + kind: "HelmRelease", + group: "", + namespace: "cert-manager", + ownerName: "cert-manager", + want: &ManagedByRef{Kind: "HelmRelease", Source: "helm", Name: "cert-manager", Namespace: "cert-manager"}, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := ManagedByFromOwner(c.kind, c.group, c.namespace, c.ownerName) + if !reflect.DeepEqual(got, c.want) { + t.Errorf("ManagedByFromOwner(%q, %q, %q, %q) = %#v, want %#v", + c.kind, c.group, c.namespace, c.ownerName, got, c.want) + } + }) + } +} + +// TestBuildSummary_NilObject defends against the typed-nil-in-interface +// trap: handlers occasionally pass interface-wrapped nils. +func TestBuildSummary_NilObject(t *testing.T) { + var obj runtime.Object + if got := BuildSummary(obj, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(nil) = %#v, want nil", got) + } + // IssueCount alone still produces output (no panic via nil obj). + got := BuildSummary(obj, SummaryOptions{IssueCount: 1}) + if got == nil || got.IssueCount != 1 { + t.Fatalf("BuildSummary(nil, IssueCount=1) = %#v, want issueCount=1", got) + } +} diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 15f5cfee7..834c0508d 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -20,23 +20,35 @@ package resourcecontext // response. Every field is optional; the zero value is a valid (empty) // "basic"-tier context. 
// -// Hints is an optional, presentation-only field — populated by AI-facing -// callers (MCP, /api/ai/*) and omitted by UI-facing callers. The structured -// fields above are the canonical facts; hints are a derived prose -// projection. +// All fields are structured. A prose `Hints []string` projection was +// considered (and prototyped) but cut from v1: our dominant agent consumer +// composes triage prose from the structured fields itself, the additional +// wire bytes earned no net signal, and once shipped, agents pattern-matching +// on hint substrings would have ossified the wording. If a real consumer +// emerges that needs deterministic prose, add it as a separate +// `explain_resource` tool rather than re-introducing it inline here. type ResourceContext struct { - Tier ContextTier `json:"tier"` - ManagedBy []ContextRef `json:"managedBy,omitempty"` - Exposes []ContextRef `json:"exposes,omitempty"` - SelectedBy []ContextRef `json:"selectedBy,omitempty"` - Uses *UsesBlock `json:"uses,omitempty"` - RunsOn *ContextRef `json:"runsOn,omitempty"` - ScaledBy []ContextRef `json:"scaledBy,omitempty"` - IssueSummary *IssueSummary `json:"issueSummary,omitempty"` - AuditSummary *AuditSummary `json:"auditSummary,omitempty"` - PolicySummary *PolicySummary `json:"policySummary,omitempty"` - Hints []string `json:"hints,omitempty"` - Omitted []OmittedField `json:"omitted,omitempty"` + Tier ContextTier `json:"tier"` + Owner *ContextRef `json:"owner,omitempty"` + ManagedBy []ContextRef `json:"managedBy,omitempty"` + Exposes []ContextRef `json:"exposes,omitempty"` + SelectedBy []ContextRef `json:"selectedBy,omitempty"` + Uses *UsesBlock `json:"uses,omitempty"` + RunsOn *ContextRef `json:"runsOn,omitempty"` + ScaledBy []ContextRef `json:"scaledBy,omitempty"` + StatusSummary *StatusSummary `json:"statusSummary,omitempty"` + PodSummary *PodSummary `json:"podSummary,omitempty"` + WorkloadSummary *WorkloadSummary `json:"workloadSummary,omitempty"` + ServiceSummary *ServiceSummary 
`json:"serviceSummary,omitempty"` + IngressSummary *IngressSummary `json:"ingressSummary,omitempty"` + NodeSummary *NodeSummary `json:"nodeSummary,omitempty"` + PVCSummary *PVCSummary `json:"pvcSummary,omitempty"` + JobSummary *JobSummary `json:"jobSummary,omitempty"` + CronJobSummary *CronJobSummary `json:"cronJobSummary,omitempty"` + IssueSummary *IssueSummary `json:"issueSummary,omitempty"` + AuditSummary *AuditSummary `json:"auditSummary,omitempty"` + PolicySummary *PolicySummary `json:"policySummary,omitempty"` + Omitted []OmittedField `json:"omitted,omitempty"` } // ContextTier signals how much enrichment is included. "basic" is the @@ -62,20 +74,23 @@ type ContextRef struct { } // ManagedByRef is the compact form of a "managed-by" pointer used in -// SummaryContext (list/search rows). Carries Kind alongside Source so +// ResourceSummaryContext (list/search rows). Carries Kind alongside Source so // consumers can distinguish e.g. a Flux Kustomization from a Flux // HelmRelease without re-parsing the Source string. Intentionally lacks // Group to keep per-row bytes minimal. type ManagedByRef struct { - Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … - Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" + Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … + Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" Name string `json:"name"` Namespace string `json:"namespace,omitempty"` } -// SummaryContext is the per-row enrichment attached to list_resources -// and search hits. Always-on, intentionally minimal (≤ ~60 bytes). -type SummaryContext struct { +// ResourceSummaryContext is the per-row enrichment attached to +// list_resources and search hits. 
The row-tier companion to +// ResourceContext (the detail-tier enrichment on GET responses) — +// optimised for bulk triage on lists at ≤ ~60 bytes per row. Always-on +// when the caller didn't opt out via context=none. +type ResourceSummaryContext struct { ManagedBy *ManagedByRef `json:"managedBy,omitempty"` Health string `json:"health,omitempty"` IssueCount int `json:"issueCount,omitempty"` @@ -90,6 +105,157 @@ type UsesBlock struct { PVCs []ContextRef `json:"pvcs,omitempty"` } +// StatusSummary is the generic, deterministic status projection used for +// built-ins and CRDs. It intentionally carries raw condition facts rather than +// prose conclusions. +type StatusSummary struct { + Phase string `json:"phase,omitempty"` + Conditions []ConditionSummary `json:"conditions,omitempty"` +} + +type ConditionSummary struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason,omitempty"` + Message string `json:"message,omitempty"` + LastTransitionTime string `json:"lastTransitionTime,omitempty"` +} + +type PodSummary struct { + Phase string `json:"phase,omitempty"` + Ready bool `json:"ready"` + RestartCount int32 `json:"restartCount,omitempty"` + Containers []ContainerStateSummary `json:"containers,omitempty"` +} + +type ContainerStateSummary struct { + Name string `json:"name"` + Ready bool `json:"ready"` + RestartCount int32 `json:"restartCount,omitempty"` + State string `json:"state,omitempty"` + Reason string `json:"reason,omitempty"` + LastTerminationReason string `json:"lastTerminationReason,omitempty"` +} + +type WorkloadSummary struct { + Replicas *ReplicaSummary `json:"replicas,omitempty"` + Conditions []ConditionSummary `json:"conditions,omitempty"` +} + +type ReplicaSummary struct { + Desired int32 `json:"desired,omitempty"` + Ready int32 `json:"ready,omitempty"` + Available int32 `json:"available,omitempty"` + Updated int32 `json:"updated,omitempty"` + Unavailable int32 `json:"unavailable,omitempty"` +} + +// 
ServiceSummary adds realized backend state for a Service. The raw Service +// spec already contains type/ports/selector; this block focuses on facts that +// require looking at related resources. +type ServiceSummary struct { + SelectedPods *PodSelectionSummary `json:"selectedPods,omitempty"` + Warnings []ServiceWarning `json:"warnings,omitempty"` +} + +type PodSelectionSummary struct { + Total int `json:"total"` + Ready int `json:"ready"` + NotReady int `json:"notReady,omitempty"` + ReadyPods []ContextRef `json:"readyPods,omitempty"` + NotReadyPods []ContextRef `json:"notReadyPods,omitempty"` + Truncated bool `json:"truncated,omitempty"` +} + +type ServiceWarning string + +const ( + ServiceWarningNoSelector ServiceWarning = "no_selector" + ServiceWarningNoSelectedPods ServiceWarning = "no_selected_pods" + ServiceWarningNoReadyPods ServiceWarning = "no_ready_pods" +) + +type IngressSummary struct { + Class string `json:"class,omitempty"` + Addresses []string `json:"addresses,omitempty"` + BackendServices []ContextRef `json:"backendServices,omitempty"` + TLSSecrets []ContextRef `json:"tlsSecrets,omitempty"` + Warnings []IngressWarning `json:"warnings,omitempty"` +} + +type IngressWarning string + +const ( + IngressWarningNoAddress IngressWarning = "no_address" + IngressWarningNoClass IngressWarning = "no_class" + IngressWarningNoRules IngressWarning = "no_rules" +) + +type NodeSummary struct { + ReadyStatus string `json:"readyStatus,omitempty"` + Unschedulable bool `json:"unschedulable,omitempty"` + Capacity map[string]string `json:"capacity,omitempty"` + Allocatable map[string]string `json:"allocatable,omitempty"` + Taints []TaintSummary `json:"taints,omitempty"` + Warnings []NodeWarning `json:"warnings,omitempty"` +} + +type TaintSummary struct { + Key string `json:"key"` + Value string `json:"value,omitempty"` + Effect string `json:"effect"` +} + +type NodeWarning string + +const ( + NodeWarningUnschedulable NodeWarning = "unschedulable" + NodeWarningNotReady 
NodeWarning = "not_ready" + NodeWarningDiskPressure NodeWarning = "disk_pressure" + NodeWarningMemoryPressure NodeWarning = "memory_pressure" + NodeWarningPIDPressure NodeWarning = "pid_pressure" + NodeWarningNetworkUnavailable NodeWarning = "network_unavailable" +) + +type PVCSummary struct { + Phase string `json:"phase,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + VolumeName string `json:"volumeName,omitempty"` + RequestedStorage string `json:"requestedStorage,omitempty"` + CapacityStorage string `json:"capacityStorage,omitempty"` + AccessModes []string `json:"accessModes,omitempty"` + VolumeMode string `json:"volumeMode,omitempty"` + Provisioner string `json:"provisioner,omitempty"` + SelectedNode string `json:"selectedNode,omitempty"` + BindCompleted string `json:"bindCompleted,omitempty"` + Warnings []PVCWarning `json:"warnings,omitempty"` +} + +type PVCWarning string + +const ( + PVCWarningPending PVCWarning = "pending" + PVCWarningLost PVCWarning = "lost" +) + +type JobSummary struct { + Active int32 `json:"active,omitempty"` + Succeeded int32 `json:"succeeded,omitempty"` + Failed int32 `json:"failed,omitempty"` + Completions int32 `json:"completions,omitempty"` + Parallelism int32 `json:"parallelism,omitempty"` + BackoffLimit int32 `json:"backoffLimit,omitempty"` + Suspended bool `json:"suspended,omitempty"` +} + +type CronJobSummary struct { + Schedule string `json:"schedule,omitempty"` + Suspended bool `json:"suspended,omitempty"` + ActiveJobs []ContextRef `json:"activeJobs,omitempty"` + LastScheduleTime string `json:"lastScheduleTime,omitempty"` + LastSuccessfulTime string `json:"lastSuccessfulTime,omitempty"` +} + // IssueSummary is a rollup of internal issue-engine findings scoped to // the subject resource. Pre-computed by callers and passed into the // generator — this package does not import internal/issues. 
diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index 2e891594c..ccb22f389 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -105,7 +105,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { IssueSummary: &IssueSummary{Count: 1}, AuditSummary: &AuditSummary{Count: 2}, PolicySummary: &PolicySummary{}, - Hints: []string{"hint"}, Omitted: []OmittedField{{Field: "selectedBy", Reason: OmittedRBACDenied}}, } b, err := json.Marshal(ac) @@ -124,7 +123,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { `"issueSummary"`, `"auditSummary"`, `"policySummary"`, - `"hints"`, `"omitted"`, } prev := -1 @@ -197,7 +195,6 @@ func TestResourceContextRoundTrip(t *testing.T) { }}, }, }, - Hints: []string{"Managed by Deployment api"}, Omitted: []OmittedField{ {Field: "selectedBy.networkPolicies", Reason: OmittedRBACDenied}, {Field: "policySummary.kyverno", Reason: OmittedNotInstalled}, @@ -217,10 +214,10 @@ func TestResourceContextRoundTrip(t *testing.T) { } } -// TestSummaryContextRoundTrip covers SummaryContext + ManagedByRef +// TestResourceSummaryContextRoundTrip covers ResourceSummaryContext + ManagedByRef // which are not embedded in ResourceContext. 
-func TestSummaryContextRoundTrip(t *testing.T) { - orig := SummaryContext{ +func TestResourceSummaryContextRoundTrip(t *testing.T) { + orig := ResourceSummaryContext{ ManagedBy: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, Health: "degraded", IssueCount: 2, @@ -229,7 +226,7 @@ func TestSummaryContextRoundTrip(t *testing.T) { if err != nil { t.Fatalf("marshal: %v", err) } - var got SummaryContext + var got ResourceSummaryContext if err := json.Unmarshal(b, &got); err != nil { t.Fatalf("unmarshal: %v", err) } @@ -244,12 +241,12 @@ func TestSummaryContextRoundTrip(t *testing.T) { s := string(b) for _, sub := range wantSubstr { if !strings.Contains(s, sub) { - t.Errorf("SummaryContext JSON missing %s: %s", sub, s) + t.Errorf("ResourceSummaryContext JSON missing %s: %s", sub, s) } } for _, forbidden := range []string{`"group"`} { if strings.Contains(s, forbidden) { - t.Errorf("SummaryContext JSON leaks %s: %s", forbidden, s) + t.Errorf("ResourceSummaryContext JSON leaks %s: %s", forbidden, s) } } } @@ -258,8 +255,8 @@ func TestSummaryContextRoundTrip(t *testing.T) { // without it, Flux Kustomization vs HelmRelease serialize to identical // JSON, forcing consumers to parse the Source string. 
func TestManagedByRefDistinguishesFluxKinds(t *testing.T) { - kustomization := SummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} - helmRelease := SummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + kustomization := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + helmRelease := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} kJSON, _ := json.Marshal(kustomization) hJSON, _ := json.Marshal(helmRelease) diff --git a/pkg/topology/pseudokinds.go b/pkg/topology/pseudokinds.go new file mode 100644 index 000000000..f1f8e500f --- /dev/null +++ b/pkg/topology/pseudokinds.go @@ -0,0 +1,48 @@ +package topology + +// KindForGVK maps a (kind, group) pair to the topology-internal pseudo-kind +// the builder uses for node IDs. The topology builder synthesizes pseudo-kinds +// for a handful of CRDs whose Kind collides with a core kind under a different +// API group — these collisions would otherwise produce ambiguous node IDs. +// +// Callers that already hold the resource's apiVersion (i.e., obj.GVK) and want +// to look up the matching topology node MUST funnel kind through this helper, +// otherwise buildNodeID would resolve to the core node and return relationships +// for the wrong object. 
+// +// Today the cross-group collisions are: +// +// serving.knative.dev/Service → "knativeservice" +// serving.knative.dev/Configuration → "knativeconfiguration" +// serving.knative.dev/Revision → "knativerevision" +// serving.knative.dev/Route → "knativeroute" +// cluster.x-k8s.io/Cluster → "capicluster" +// networking.istio.io/Gateway → "istiogateway" +// +// For any other (kind, group) pair — including core kinds with group=="" and +// non-colliding CRDs — KindForGVK returns kind unchanged. buildNodeID's own +// kindMap then handles URL-plural-to-singular flattening. +func KindForGVK(kind, group string) string { + switch group { + case "serving.knative.dev": + switch kind { + case "Service": + return "knativeservice" + case "Configuration": + return "knativeconfiguration" + case "Revision": + return "knativerevision" + case "Route": + return "knativeroute" + } + case "cluster.x-k8s.io": + if kind == "Cluster" { + return "capicluster" + } + case "networking.istio.io": + if kind == "Gateway" { + return "istiogateway" + } + } + return kind +} diff --git a/pkg/topology/pseudokinds_test.go b/pkg/topology/pseudokinds_test.go new file mode 100644 index 000000000..91786564e --- /dev/null +++ b/pkg/topology/pseudokinds_test.go @@ -0,0 +1,61 @@ +package topology + +import "testing" + +// KindForGVK is the bridge between (obj.Kind, obj.Group) and the topology +// builder's pseudo-kind node-ID prefix. The builder emits pseudo-kinds for +// CRDs whose Kind collides with a core kind under a different group +// (Knative Service vs core Service, CAPI Cluster vs… nothing today but a +// future "Cluster" core kind, Istio Gateway vs Gateway API Gateway). +// +// A regression in this helper silently routes single-resource relationship +// lookups for those CRDs to the wrong topology node, so the table covers +// every group remapping plus the pass-through cases. 
+func TestKindForGVK(t *testing.T) { + tests := []struct { + name string + kind string + group string + want string + }{ + // Knative Serving collisions. + {"knative service", "Service", "serving.knative.dev", "knativeservice"}, + {"knative configuration", "Configuration", "serving.knative.dev", "knativeconfiguration"}, + {"knative revision", "Revision", "serving.knative.dev", "knativerevision"}, + {"knative route", "Route", "serving.knative.dev", "knativeroute"}, + // CAPI collision (Cluster, distinct from any future "Cluster" core kind). + {"capi cluster", "Cluster", "cluster.x-k8s.io", "capicluster"}, + // Istio Gateway collision (vs Gateway API's gateway.networking.k8s.io/Gateway). + {"istio gateway", "Gateway", "networking.istio.io", "istiogateway"}, + + // Pass-through: core kinds (group == ""). + {"core service passthrough", "Service", "", "Service"}, + {"core pod passthrough", "Pod", "", "Pod"}, + // Pass-through: apps group. + {"apps deployment passthrough", "Deployment", "apps", "Deployment"}, + {"batch job passthrough", "Job", "batch", "Job"}, + // Pass-through: Gateway API (uses the gateway.networking.k8s.io group, + // distinct from networking.istio.io — must NOT be remapped to istiogateway). + {"gateway api gateway passthrough", "Gateway", "gateway.networking.k8s.io", "Gateway"}, + // Pass-through: non-colliding CRDs. + {"argo application passthrough", "Application", "argoproj.io", "Application"}, + {"cert-manager certificate passthrough", "Certificate", "cert-manager.io", "Certificate"}, + // Pass-through: a Kind that matches a Knative collision but under the + // wrong group must NOT remap. Guards against accidental kind-only + // matching that would mis-classify e.g. core Route or future CRDs. + {"route under wrong group", "Route", "route.openshift.io", "Route"}, + {"service under wrong group", "Service", "argoproj.io", "Service"}, + // Empty kind: pass-through (caller's problem to validate). 
+ {"empty kind", "", "serving.knative.dev", ""}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := KindForGVK(tc.kind, tc.group) + if got != tc.want { + t.Errorf("KindForGVK(%q, %q) = %q, want %q", tc.kind, tc.group, got, tc.want) + } + }) + } +}