diff --git a/internal/issues/issues.go b/internal/issues/issues.go index 4a3764dae..5bb4f6279 100644 --- a/internal/issues/issues.go +++ b/internal/issues/issues.go @@ -74,10 +74,17 @@ func Compose(p Provider, f Filters) []Issue { // severity desc, then last-seen desc, then kind/ns/name for stable // tiebreaks. func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { + // Negative Limit is the "uncapped" sentinel: callers that need the + // full matched set (per-resource issue indexes for /api/ai list + + // search summaryContext) pass NoLimit so a 5000-issue cluster + // doesn't silently drop counts for resources whose issues fall in + // the tail beyond MaxLimit. Zero still maps to DefaultLimit so the + // public /api/issues + MCP issues_list keep their tight caps. + uncapped := f.Limit < 0 if f.Limit == 0 { f.Limit = DefaultLimit } - if f.Limit > MaxLimit { + if !uncapped && f.Limit > MaxLimit { f.Limit = MaxLimit } @@ -201,7 +208,7 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { return out[i].Name < out[j].Name }) stats.TotalMatched = len(out) - if len(out) > f.Limit { + if !uncapped && len(out) > f.Limit { out = out[:f.Limit] } return out, stats @@ -211,11 +218,24 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) { // warning Issue for each object that has a False Ready/Available/etc. // condition. Skips kinds owned by curated checkers (Cluster API today) // to avoid double-reporting. +// +// When f.Kinds is non-empty (e.g. summaryContext building a per-resource +// issue index for a list_resources call on a single kind), GVRs whose +// kind isn't in the filter are skipped BEFORE the ListDynamic call — +// without this gate, a pods-only request still scanned every watched +// CRD up front and applyFilters discarded the rows afterward. Kind +// comparison mirrors applyFilters: lowercase for case-insensitive +// match against the user's filter (which itself is canonicalized to +// the singular form upstream). func detectGenericCRDIssues(p Provider, f Filters) []Issue { gvrs := p.WatchedDynamic() if len(gvrs) == 0 { return nil } + wantKind := map[string]bool{} + for _, k := range f.Kinds { + wantKind[strings.ToLower(k)] = true + } var out []Issue for _, gvr := range gvrs { if isCuratedCRDGroup(gvr.Group) { @@ -225,6 +245,15 @@ func detectGenericCRDIssues(p Provider, f Filters) []Issue { if kind == "" { continue } + // applyFilters runs after Compose returns — but on hot paths that + // pin a single kind (summaryContext per-row index), routing the + // kind filter through here skips the per-GVR ListDynamic call + // entirely. Match in lowercase (same as applyFilters) so + // "Pod"/"pod" and CRD-typed "MyResource"/"myresource" both + // compare equal. + if len(wantKind) > 0 && !wantKind[strings.ToLower(kind)] { + continue + } clusterScoped, _, _ := classifyDynamicScope(p, gvr, kind) if clusterScoped && f.CanReadClusterScoped != nil && !f.CanReadClusterScoped(kind, gvr.Group) { continue diff --git a/internal/issues/issues_test.go b/internal/issues/issues_test.go index 2e774746a..beb995725 100644 --- a/internal/issues/issues_test.go +++ b/internal/issues/issues_test.go @@ -622,3 +622,87 @@ func TestFlattenNamespacedProblems_EmptyInputReturnsNil(t *testing.T) { t.Errorf("empty input should produce empty output, got %+v", out) } } + +// countingProvider wraps fakeProvider and tallies ListDynamic calls per +// GVR. Used by TestDetectGenericCRDIssues_SkipsListWhenKindFiltered to +// pin that detectGenericCRDIssues short-circuits the per-GVR +// ListDynamic call when f.Kinds excludes the GVR's kind — on clusters +// with hundreds of watched CRDs, scanning every one for a pods-only +// summaryContext request was the dominant cost. +type countingProvider struct { + fakeProvider + listCalls map[schema.GroupVersionResource]int +} + +func (c *countingProvider) ListDynamic(gvr schema.GroupVersionResource, ns string) ([]*unstructured.Unstructured, error) { + if c.listCalls == nil { + c.listCalls = map[schema.GroupVersionResource]int{} + } + c.listCalls[gvr]++ + return c.fakeProvider.ListDynamic(gvr, ns) +} + +// TestDetectGenericCRDIssues_SkipsListWhenKindFiltered pins the +// "scan all CRDs before kindFilter applies" perf fix in +// detectGenericCRDIssues. Pre-fix, a Compose call with Kinds=["Pod"] +// still iterated every watched CRD GVR and ran ListDynamic on each; +// applyFilters then discarded the non-matching rows at the end. +// +// On a cluster with hundreds of watched CRDs this dominated the +// summaryContext per-row index build for list_resources kind=pods. +// The fix routes f.Kinds awareness into detectGenericCRDIssues so +// non-matching GVRs skip the ListDynamic call entirely. +func TestDetectGenericCRDIssues_SkipsListWhenKindFiltered(t *testing.T) { + podGVR := schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} + appGVR := schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "applications"} + npGVR := schema.GroupVersionResource{Group: "karpenter.sh", Version: "v1", Resource: "nodepools"} + + p := &countingProvider{ + fakeProvider: fakeProvider{ + dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{ + podGVR: {}, // empty — only counts the call. + appGVR: {{Object: map[string]any{ + "metadata": map[string]any{"name": "a", "namespace": "argocd"}, + "status": map[string]any{ + "conditions": []any{ + map[string]any{"type": "Synced", "status": "False", "reason": "Drift"}, + }, + }, + }}}, + npGVR: {}, // empty — only counts the call. + }, + kinds: map[schema.GroupVersionResource]string{ + podGVR: "Pod", + appGVR: "Application", + npGVR: "NodePool", + }, + }, + } + + // kindFilter restricts to Application — the other two GVRs must NOT + // be listed. detectGenericCRDIssues lowercases the kind comparison + // (mirrors applyFilters), so the canonical "Application" matches the + // emitted Kind for the argoproj.io GVR. + _ = detectGenericCRDIssues(p, Filters{Kinds: []string{"Application"}}) + + if got := p.listCalls[podGVR]; got != 0 { + t.Errorf("Pod GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[npGVR]; got != 0 { + t.Errorf("NodePool GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", got) + } + if got := p.listCalls[appGVR]; got == 0 { + t.Errorf("Application GVR ListDynamic calls = %d, want >= 1 (matching kind must still be scanned)", got) + } + + // Sanity: empty Kinds filter scans every GVR (no per-kind shortcut + // when caller didn't ask for one). Pins that the fix is filter-aware + // rather than always-skip. + p.listCalls = nil + _ = detectGenericCRDIssues(p, Filters{}) + for gvr, want := range map[schema.GroupVersionResource]bool{podGVR: true, appGVR: true, npGVR: true} { + if got := p.listCalls[gvr] > 0; got != want { + t.Errorf("no kind filter: GVR %s called=%v, want %v", gvr.Resource, got, want) + } + } +} diff --git a/internal/issues/types.go b/internal/issues/types.go index 11b368433..63b22a38c 100644 --- a/internal/issues/types.go +++ b/internal/issues/types.go @@ -123,4 +123,11 @@ type Filters struct { const ( DefaultLimit = 200 MaxLimit = 1000 + // NoLimit disables the result cap. Pass as Filters.Limit when the + // caller needs the full matched set (e.g. building a per-resource + // issue index for summaryContext — capping there would silently zero + // out counts for resources whose issues fall in the tail beyond + // MaxLimit on large clusters). Stats.TotalMatched is reliable + // regardless; this just turns off the post-sort slice. + NoLimit = -1 ) diff --git a/internal/k8s/problems.go b/internal/k8s/problems.go index f5e19d452..766f98f34 100644 --- a/internal/k8s/problems.go +++ b/internal/k8s/problems.go @@ -59,6 +59,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d available", d.Status.AvailableReplicas, d.Status.Replicas), Age: FormatAge(ageDur), @@ -78,6 +79,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Deployment", Namespace: d.Namespace, Name: d.Name, + Group: "apps", Severity: "critical", Reason: "Rollout stuck", Message: cond.Message, @@ -107,6 +109,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "StatefulSet", Namespace: ss.Namespace, Name: ss.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d/%d ready", ss.Status.ReadyReplicas, ss.Status.Replicas), Age: FormatAge(ageDur), @@ -133,6 +136,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "DaemonSet", Namespace: ds.Namespace, Name: ds.Name, + Group: "apps", Severity: "critical", Reason: fmt.Sprintf("%d unavailable", ds.Status.NumberUnavailable), Age: FormatAge(ageDur), @@ -157,6 +161,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "HorizontalPodAutoscaler", Namespace: hp.Namespace, Name: hp.Name, + Group: "autoscaling", Severity: "medium", Reason: hp.Problem, Message: hp.Reason, @@ -177,6 +182,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "CronJob", Namespace: cp.Namespace, Name: cp.Name, + Group: "batch", Severity: "medium", Reason: cp.Problem, Message: cp.Reason, @@ -251,6 +257,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem { Kind: "Job", Namespace: job.Namespace, Name: job.Name, + Group: "batch", Severity: "high", Reason: fmt.Sprintf("Running for %s with no completions", FormatAge(ageDur)), Age: FormatAge(ageDur), diff --git a/internal/k8s/problems_test.go b/internal/k8s/problems_test.go new file mode 100644 index 000000000..779b74f4f --- /dev/null +++ b/internal/k8s/problems_test.go @@ -0,0 +1,144 @@ +package k8s + +import ( + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// TestDetectProblems_PopulatesGroup pins that every built-in Problem +// emitted by DetectProblems carries the correct canonical API group. +// +// The summary_context issue index keys per-resource counts as +// "group|kind|ns|name" — a Problem with an empty Group collides with +// no real bucket, silently zeroing issueCount for that workload row. +// Pre-fix, all the built-in append-Problem sites omitted the field, so +// every broken Deployment/StatefulSet/DaemonSet/HPA/CronJob/Job +// reported issueCount: 0 in the AI list envelope — a regression +// against the pre-group-aware behavior. +// +// Construct one broken object per built-in kind, drive DetectProblems +// against a fake client, and assert each emitted Problem's Group +// matches the canonical group for its kind. +func TestDetectProblems_PopulatesGroup(t *testing.T) { + defer ResetTestState() + + oneReplica := int32(1) + minReplicas := int32(1) + now := time.Now() + // Job needs to be older than 1h to surface a "stuck" problem. + jobStart := metav1.NewTime(now.Add(-2 * time.Hour)) + + client := fake.NewClientset( + // Deployment with unavailable replicas — triggers the + // "X/Y available" Problem branch. + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: appsv1.DeploymentSpec{Replicas: &oneReplica}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UnavailableReplicas: 1, + }, + }, + // StatefulSet with readyReplicas < replicas. + &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "prod"}, + Spec: appsv1.StatefulSetSpec{Replicas: &oneReplica}, + Status: appsv1.StatefulSetStatus{ + Replicas: 1, + ReadyReplicas: 0, + }, + }, + // DaemonSet with numberUnavailable > 0. + &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Name: "logger", Namespace: "prod"}, + Status: appsv1.DaemonSetStatus{ + NumberUnavailable: 2, + }, + }, + // HPA at its replica ceiling — DetectHPAProblems flags + // "maxed" when current and desired both hit MaxReplicas. + // The wrapper sets Group="autoscaling". + &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MinReplicas: &minReplicas, + MaxReplicas: 10, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + }, + }, + // Job stuck Active>0 for >1h with no completions. + &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod", CreationTimestamp: jobStart}, + Status: batchv1.JobStatus{ + Active: 1, + Succeeded: 0, + Failed: 0, + }, + }, + ) + + if err := InitTestResourceCache(client); err != nil { + t.Fatalf("InitTestResourceCache: %v", err) + } + cache := GetResourceCache() + if cache == nil { + t.Fatal("cache nil after init") + } + + // Allow informers a brief moment to populate. The fake clientset + // pre-seeds the store, but the lister types reconstruct via + // informer events on a separate goroutine. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if hasAllProblemTypes(DetectProblems(cache, "prod")) { + break + } + time.Sleep(20 * time.Millisecond) + } + + problems := DetectProblems(cache, "prod") + + wantGroup := map[string]string{ + "Deployment": "apps", + "StatefulSet": "apps", + "DaemonSet": "apps", + "HorizontalPodAutoscaler": "autoscaling", + "Job": "batch", + } + + got := make(map[string]string, len(problems)) + for _, p := range problems { + // One Problem per kind is enough for the Group assertion; + // duplicates (e.g. Deployment Available + ProgressDeadline) + // must agree on Group so the last-write-wins shape is fine. + got[p.Kind] = p.Group + } + + for kind, want := range wantGroup { + gotGroup, ok := got[kind] + if !ok { + t.Errorf("no Problem emitted for %s — fixture wiring broken; got %d problems: %+v", kind, len(problems), problems) + continue + } + if gotGroup != want { + t.Errorf("%s.Group = %q, want %q (summary_context index keys by group — empty Group zeros issueCount)", kind, gotGroup, want) + } + } +} + +func hasAllProblemTypes(problems []Problem) bool { + seen := map[string]bool{} + for _, p := range problems { + seen[p.Kind] = true + } + return seen["Deployment"] && seen["StatefulSet"] && seen["DaemonSet"] && seen["HorizontalPodAutoscaler"] && seen["Job"] +} diff --git a/internal/mcp/summary_context.go b/internal/mcp/summary_context.go new file mode 100644 index 000000000..78f36cf0f --- /dev/null +++ b/internal/mcp/summary_context.go @@ -0,0 +1,96 @@ +// Per-request helpers that compute the compact ResourceSummaryContext attached +// to list_resources rows and search hits served via MCP. +// +// The shared core (issue index, kind canonicalization, managedBy +// resolution, per-row scope dispatch) lives in +// internal/summarycontext. This file is the MCP-specific wrapper — it +// sources topology from a short-TTL per-process memoizer (MCP has no +// shared broadcaster cache) and otherwise just plumbs arguments through. + +package mcp + +import ( + "time" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/internal/summarycontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// newResourceSummaryContextBuilder assembles the per-request closure for MCP +// list_resources. Returns nil when the cache or topology isn't +// available, in which case the caller should skip context attachment +// rather than emit empty objects. +// +// namespaces scopes the issue index to just the rows being returned; +// pass nil for cluster-wide. +// +// Use newSearchSummaryContextBuilder for MCP search, which routes +// per-hit between a namespaced and a cluster-wide index — search +// returns mixed kinds in one response, so a single index can't get +// both right. +func newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + idx := summarycontext.BuildIssueIndex(provider, namespaces) + return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(namespaces), idx, idx) +} + +// newSearchSummaryContextBuilder is the MCP search variant. Mirrors +// internal/server.newSearchSummaryContextBuilder — see that comment for +// the dual-index rationale (mixed-kind hits, cluster-scoped issues at +// namespace=""). MCP search-level RBAC (CanReadClusterScoped via +// canReadClusterScopedKind) already gates which cluster-scoped kinds +// are reachable, so composing the cluster-wide index doesn't leak +// rows the user can't see. +func newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) + } + return summarycontext.BuilderFromIndexes(buildSummaryContextTopology(scanNamespaces), namespacedIdx, clusterIdx) +} + +// summaryCtxTopoMemo caches topology builds across summary-context list and +// search invocations. MCP has no shared broadcaster cache, so without +// memoization every list_resources / search call from an agent pays a +// full topology build (multi-second on multi-thousand-resource clusters). +// 5s TTL matches the REST broadcaster's cadence — short enough that +// managedBy stays current after a context switch, long enough that a +// burst of agent calls amortizes the build cost. +// +// Other MCP tools (handleGetResource, get_neighborhood) still build +// inline; threading them through here is a separate follow-up. +var summaryCtxTopoMemo = topology.NewMemoizer(5 * time.Second) + +// buildSummaryContextTopology returns a topology snapshot suitable for +// resolving managedBy pointers, reusing a cached snapshot when one is +// fresh. Returns nil on failure — the caller falls back to a +// managedBy-less ResourceSummaryContext rather than failing the response. +func buildSummaryContextTopology(namespaces []string) *topology.Topology { + cache := k8s.GetResourceCache() + if cache == nil { + return nil + } + opts := topology.DefaultBuildOptions() + if len(namespaces) > 0 { + opts.Namespaces = namespaces + } + topo, err := summaryCtxTopoMemo.Get(opts, func() (*topology.Topology, error) { + builder := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). + WithDynamic(k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery())) + return builder.Build(opts) + }) + if err != nil { + return nil + } + return topo +} diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index d76d9b612..7852a106a 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -12,6 +12,7 @@ import ( "github.com/modelcontextprotocol/go-sdk/mcp" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" "github.com/skyhook-io/radar/internal/filter" @@ -19,6 +20,7 @@ import ( "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" "github.com/skyhook-io/radar/internal/search" + "github.com/skyhook-io/radar/internal/summarycontext" "github.com/skyhook-io/radar/internal/timeline" aicontext "github.com/skyhook-io/radar/pkg/ai/context" topology "github.com/skyhook-io/radar/pkg/topology" @@ -315,6 +317,7 @@ type listResourcesInput struct { Kind string `json:"kind" jsonschema:"resource kind to list, e.g. pods, deployments, services, configmaps"` Group string `json:"group,omitempty" jsonschema:"API group when the kind is ambiguous (e.g. serving.knative.dev for Knative Service vs core Service)"` Namespace string `json:"namespace,omitempty" jsonschema:"filter to a specific namespace"` + Context string `json:"context,omitempty" jsonschema:"per-row context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare rows"` } type getResourceInput struct { @@ -358,6 +361,7 @@ type searchInput struct { Limit int `json:"limit,omitempty" jsonschema:"max hits returned (default 50, max 500)"` Include string `json:"include,omitempty" jsonschema:"per-hit detail: summary (default), raw, or none"` Filter string `json:"filter,omitempty" jsonschema:"optional CEL boolean expression run against each candidate K8s object. Bindings: kind, apiVersion, metadata, spec, status, labels, annotations. Use has(x.y) before optional fields. Examples: 'kind == \"Pod\" && status.phase == \"Failed\"', 'labels[\"app\"] == \"cart\"', 'has(status.readyReplicas) && status.readyReplicas == 0'"` + Context string `json:"context,omitempty" jsonschema:"per-hit context: omit (default) attaches summaryContext (managedBy + health + issueCount) for triage; 'none' returns bare hits"` } type issuesInput struct { @@ -464,13 +468,27 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li listScope = nil } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. Knative + // serving.knative.dev/Service vs corev1 ""/Service) reach the right + // resource. FetchResourceList is group-blind — it would silently + // return the core typed list, dropping the caller's group filter on + // the floor. Mirrors the group-aware short-circuit in REST + // handleAIListResources and handleGetResource (PR #721). + if group != "" { + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) + } + + // Try typed cache first (group=="" → core/built-in lookup). objs, err := k8s.FetchResourceList(cache, kind, listScope) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs. ClassifyKindScope/SAR // above already authorized cluster-scoped CRDs; namespaced CRDs - // are scoped via listScope. - return listDynamicResources(ctx, cache, kind, group, listScope) + // are scoped via listScope. Pass clusterScoped through so the + // issue index drops the namespace filter for cluster-scoped + // CRDs — those issues live at namespace="" and would otherwise + // be filtered out by the user's namespaced-access set. + return listDynamicResources(ctx, cache, kind, group, listScope, clusterScoped, input.Context) } if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) @@ -492,28 +510,62 @@ func handleListResources(ctx context.Context, req *mcp.CallToolRequest, input li return nil, nil, fmt.Errorf("failed to minify: %w", err) } + // Attach summaryContext per row unless caller opted out. Issue index + // is scoped to the listed kind so the per-row count reflects only + // the resource being listed (not unrelated noise in the namespace). + // + // Cluster-scoped kinds (Node, PV, cluster-scoped CRDs) emit issues + // at namespace="" — scoping the index to the user's namespaced + // access set would silently zero issueCount on every row. The + // cluster-scoped RBAC gate above (canReadClusterScopedKind) already + // authorized the read, so we pass nil here to compose cluster-wide. + if input.Context != "none" { + idxNamespaces := allowed + if clusterScoped { + idxNamespaces = nil + } + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToTypedList(results, objs, builder) + } + } + return toJSONResult(results) } -func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string) (*mcp.CallToolResult, any, error) { - var allItems []any +func listDynamicResources(ctx context.Context, cache *k8s.ResourceCache, kind, group string, namespaces []string, clusterScoped bool, contextMode string) (*mcp.CallToolResult, any, error) { + var rawItems []*unstructured.Unstructured if len(namespaces) > 0 { for _, ns := range namespaces { items, err := cache.ListDynamicWithGroup(ctx, kind, ns, group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) - } + rawItems = append(rawItems, items...) } } else { items, err := cache.ListDynamicWithGroup(ctx, kind, "", group) if err != nil { return nil, nil, fmt.Errorf("failed to list %s: %w", kind, err) } - for _, item := range items { - allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + rawItems = items + } + + allItems := make([]any, 0, len(rawItems)) + for _, item := range rawItems { + allItems = append(allItems, aicontext.MinifyUnstructured(item, aicontext.LevelSummary)) + } + + if contextMode != "none" { + // Cluster-scoped CRDs emit issues at namespace="" — passing a + // namespace-restricted slice would silently zero issueCount on + // every row. Caller has already gated cluster-scoped reads via + // canReadClusterScopedKind, so cluster-wide compose is safe. + idxNamespaces := namespaces + if clusterScoped { + idxNamespaces = nil + } + if builder := newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToUnstructuredList(allItems, rawItems, builder) } } @@ -2048,6 +2100,17 @@ func handleSearch(ctx context.Context, req *mcp.CallToolRequest, input searchInp } opts.Filter = f } + // Search uses the dual-index variant: hits are mixed-kind (a single + // query can return both namespaced Pods and cluster-scoped Nodes), + // so a single namespace-scoped issue index zeroes issueCount on + // cluster-scoped hits whose problems live at namespace="". The + // builder routes per-hit by scope; CanReadClusterScoped above + // already gates which cluster-scoped kinds are reachable. + if input.Context != "none" { + if builder := newSearchSummaryContextBuilder(scanNamespaces); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } result, err := search.Search(ctx, provider, parsed, opts) if err != nil { return nil, nil, err diff --git a/internal/mcp/tools_filter_test.go b/internal/mcp/tools_filter_test.go index 115a8a1dc..20c5c7227 100644 --- a/internal/mcp/tools_filter_test.go +++ b/internal/mcp/tools_filter_test.go @@ -148,6 +148,46 @@ func containsName(payload, name string) bool { return strings.Contains(payload, `"name":"`+name+`"`) } +// TestHandleListResources_GroupRoutesToDynamic pins the group-aware +// short-circuit on the MCP list_resources path. For kind=services with +// no group, the typed core Service list returns the seeded fixture. For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// listDynamicResources instead. Mirrors the REST-side fix in +// handleAIListResources and the GET-side fix from PR #721. +// +// setupFakeCacheForFilterTests doesn't initialize the dynamic cache, so +// the dynamic call surfaces an error. listDynamicResources wraps it in +// "failed to list %s: …" — pin both that the result does NOT contain +// the core Service AND that the call returned the dynamic-cache error +// (proving the routing change is in place). +func TestHandleListResources_GroupRoutesToDynamic(t *testing.T) { + setupFakeCacheForFilterTests(t) + ctx := withRestrictedUser(t, "alice", []string{"alpha"}) + + // With no group: typed cache, but no Services in the fixture so + // it's an empty list. Sanity check the baseline. + _, _, err := handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha"}) + if err != nil { + t.Fatalf("baseline (no group): %v", err) + } + + // With group=serving.knative.dev: must route to dynamic. The fake + // cache has no dynamic discovery wired, so we expect an error + // rather than a (wrong) 200 with typed core Services. + _, _, err = handleListResources(ctx, nil, listResourcesInput{Kind: "services", Namespace: "alpha", Group: "serving.knative.dev"}) + if err == nil { + t.Fatalf("group=serving.knative.dev: expected dynamic-cache routing error (no discovery in test harness), got nil err — handler may have silently returned typed core Services (pre-fix bug)") + } + // The wrapped error should reflect the dynamic path, not a typed + // cache lookup. Match loosely on shape so future error-text + // refactors don't flake the test. + if !strings.Contains(err.Error(), "services") { + t.Errorf("error should mention services kind: %v", err) + } +} + func TestHandleListResources_RestrictedUser(t *testing.T) { setupFakeCacheForFilterTests(t) diff --git a/internal/search/search.go b/internal/search/search.go index 4d05f75c7..df7b5da01 100644 --- a/internal/search/search.go +++ b/internal/search/search.go @@ -22,8 +22,23 @@ import ( "github.com/skyhook-io/radar/internal/k8s" aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" ) +// SummaryBuilderFunc, when supplied via Options.SummaryBuilder, is +// invoked once per matched hit to produce the compact SummaryContext +// attached to the hit's summaryContext field. Exactly one of obj/u will be +// non-nil — typed kinds pass obj, dynamic CRDs pass u. Returning nil +// is fine (the field is omitempty); callers use it to gate context +// emission per request (context=none opts out by passing nil here). +// +// group is the candidate's API group (already known to the search +// walker — typed kinds via typedKinds, CRDs via gvr.Group). Threading +// it through lets the builder distinguish CRDs that share +// kind+namespace+name across groups (e.g. Knative Service vs corev1 +// Service) in its per-resource issue index. +type SummaryBuilderFunc func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext + // Provider abstracts the cache so tests can inject a fake. type Provider interface { ListTyped(kind string, namespaces []string) ([]runtime.Object, error) @@ -106,9 +121,27 @@ type Options struct { // drop the candidate. Compile happens in the handler; this layer // just runs the program. Filter *CELFilter + // SummaryBuilder, when non-nil, is invoked per matched hit to + // attach the compact summaryContext (managedBy + health + + // issueCount). Handlers provide a closure that wraps the + // request-scoped topology + per-namespace issue index so the + // per-row cost stays flat. Pass nil to opt out (context=none) — + // the field is omitempty and consumers must tolerate its absence. + SummaryBuilder SummaryBuilderFunc } // Search runs the parsed query against the provider and returns ranked hits. +// pendingHit pairs a Hit with the source object that produced it, so the +// SummaryBuilder (topology lookups, issue-index reads) can be deferred +// until AFTER the hits are sorted and truncated to opts.Limit. Lifecycle is +// strictly internal to Search — never escapes the function. +type pendingHit struct { + hit Hit + obj runtime.Object // typed source (nil for CRD hits) + u *unstructured.Unstructured // unstructured source (nil for typed hits) + c candidate // for c.Group/Kind/Namespace/Name when invoking SummaryBuilder +} + func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, error) { if opts.Limit <= 0 { opts.Limit = DefaultLimit @@ -118,7 +151,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } var res Result - var hits []Hit + // Buffer hits along with the source object so summaryBuilder (topology + // lookups, issue-index reads) can run AFTER sort + truncate — without + // this, broad queries pay topology lookups for thousands of matches + // only to ship at most opts.Limit of them. + var pending []pendingHit // CEL filter eval errors are silently dropped per-row (the agent // just gets fewer hits, no 500), but we log the first error so an // operator can see when rows are dying to runtime issues — typical @@ -204,7 +241,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, obj, nil)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, c, opts.Include, obj, nil), + obj: obj, + c: c, + }) } } @@ -270,7 +311,11 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err continue } } - hits = append(hits, buildHit(score, matched, c, opts.Include, nil, u)) + pending = append(pending, pendingHit{ + hit: buildHit(score, matched, c, opts.Include, nil, u), + u: u, + c: c, + }) } } @@ -282,21 +327,34 @@ func Search(ctx context.Context, p Provider, q Query, opts Options) (Result, err } } - sort.SliceStable(hits, func(i, j int) bool { - if hits[i].Score != hits[j].Score { - return hits[i].Score > hits[j].Score + sort.SliceStable(pending, func(i, j int) bool { + if pending[i].hit.Score != pending[j].hit.Score { + return pending[i].hit.Score > pending[j].hit.Score } - if hits[i].Kind != hits[j].Kind { - return hits[i].Kind < hits[j].Kind + if pending[i].hit.Kind != pending[j].hit.Kind { + return pending[i].hit.Kind < pending[j].hit.Kind } - if hits[i].Namespace != hits[j].Namespace { - return hits[i].Namespace < hits[j].Namespace + if pending[i].hit.Namespace != pending[j].hit.Namespace { + return pending[i].hit.Namespace < pending[j].hit.Namespace } - return hits[i].Name < hits[j].Name + return pending[i].hit.Name < pending[j].hit.Name }) - res.TotalMatched = len(hits) - if len(hits) > opts.Limit { - hits = hits[:opts.Limit] + res.TotalMatched = len(pending) + if len(pending) > opts.Limit { + pending = pending[:opts.Limit] + } + + // Summary attach happens HERE — after truncation — so the topology + // lookups + issue-index reads only run for the hits we'll actually + // ship. Skipped entirely when SummaryBuilder is nil (caller opted out + // via context=none). + hits := make([]Hit, len(pending)) + for i := range pending { + hits[i] = pending[i].hit + if opts.SummaryBuilder != nil { + c := pending[i].c + hits[i].SummaryContext = opts.SummaryBuilder(pending[i].obj, pending[i].u, c.Group, c.Kind, c.Namespace, c.Name) + } } res.Hits = hits res.Total = len(hits) @@ -345,7 +403,10 @@ func isClusterScopedKind(kind string) bool { // buildHit assembles the response shape for a matched candidate. Exactly // one of obj/u will be non-nil. minify-on-demand keeps the cost of -// IncludeNone (identity-only) flat. +// IncludeNone (identity-only) flat. SummaryContext attachment is NOT +// done here — it happens in Search's post-truncation loop so the +// expensive topology lookups + issue-index reads only run for the hits +// that survive sort + Limit truncation. func buildHit(score int, matched []MatchedField, c candidate, mode IncludeMode, obj runtime.Object, u *unstructured.Unstructured) Hit { h := Hit{ Score: score, diff --git a/internal/search/summary_context_test.go b/internal/search/summary_context_test.go new file mode 100644 index 000000000..0f8b3228e --- /dev/null +++ b/internal/search/summary_context_test.go @@ -0,0 +1,92 @@ +package search + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// TestSearch_SummaryBuilderAttached pins the wiring: when Options.SummaryBuilder +// is non-nil, the executor invokes it per kept hit and the result lands +// in Hit.SummaryContext. +func TestSearch_SummaryBuilderAttached(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, + }, + }, + }, + }, + } + + var calls int + var gotGroup string + builder := func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + calls++ + gotGroup = group + return &resourcecontext.ResourceSummaryContext{ + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: namespace}, + Health: "healthy", + IssueCount: 0, + } + } + + res, _ := Search(context.Background(), p, Parse("api-1"), Options{ + Include: IncludeNone, + SummaryBuilder: builder, + }) + if calls != 1 { + t.Fatalf("SummaryBuilder calls = %d, want 1", calls) + } + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + h := res.Hits[0] + if h.SummaryContext == nil { + t.Fatalf("SummaryContext not attached to hit: %+v", h) + } + if h.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", h.SummaryContext.Health) + } + if h.SummaryContext.ManagedBy == nil || h.SummaryContext.ManagedBy.Name != "api" { + t.Errorf("ManagedBy mismatch: %+v", h.SummaryContext.ManagedBy) + } + // Pod is core-group — builder should see "" for group, threaded + // through from candidate.Group (set on the typed walker via tk.Group). + if gotGroup != "" { + t.Errorf("builder saw group=%q for core-group Pod, want \"\"", gotGroup) + } +} + +// TestSearch_NoSummaryBuilder_LeavesNilContext is the opt-out path +// (context=none in the handler maps to nil SummaryBuilder here). Hits +// must have no SummaryContext. +func TestSearch_NoSummaryBuilder_LeavesNilContext(t *testing.T) { + p := &fakeProvider{ + typed: map[string][]runtime.Object{ + "pods": { + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "prod", Name: "api-1"}, + }, + }, + }, + } + res, _ := Search(context.Background(), p, Parse("api-1"), Options{Include: IncludeNone}) + if len(res.Hits) != 1 { + t.Fatalf("hits = %d, want 1", len(res.Hits)) + } + if res.Hits[0].SummaryContext != nil { + t.Errorf("expected nil SummaryContext when SummaryBuilder unset, got %+v", res.Hits[0].SummaryContext) + } +} diff --git a/internal/search/types.go b/internal/search/types.go index 83551ba86..b33215853 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -1,6 +1,9 @@ package search -import "github.com/skyhook-io/radar/internal/filter" +import ( + "github.com/skyhook-io/radar/internal/filter" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) const ( DefaultLimit = 50 @@ -46,6 +49,10 @@ type Hit struct { Summary any `json:"summary,omitempty"` Raw any `json:"raw,omitempty"` Matched []MatchedField `json:"matched,omitempty"` + // SummaryContext is the compact per-row enrichment (managedBy, health, + // issueCount). Populated by handlers via Options.SummaryBuilder; nil + // when the caller opted out (context=none) or no fields apply. + SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // MatchedField records where a query token landed (debug + UI highlight). @@ -86,4 +93,3 @@ const ( IncludeRaw IncludeNone // identity only (cheapest) ) - diff --git a/internal/server/ai_handlers.go b/internal/server/ai_handlers.go index 10593ccfd..40db87bc9 100644 --- a/internal/server/ai_handlers.go +++ b/internal/server/ai_handlers.go @@ -41,6 +41,7 @@ import ( "github.com/skyhook-io/radar/internal/audit" "github.com/skyhook-io/radar/internal/issues" "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/internal/summarycontext" aicontext "github.com/skyhook-io/radar/pkg/ai/context" bpaudit "github.com/skyhook-io/radar/pkg/audit" "github.com/skyhook-io/radar/pkg/policyreports" @@ -94,19 +95,33 @@ func parseVerbosity(r *http.Request, defaultLevel aicontext.VerbosityLevel) aico } // handleAIListResources returns a minified list of resources for AI consumption. -// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact +// GET /api/ai/resources/{kind}?namespace=X&group=X&verbosity=summary|detail|compact&context=none +// +// summaryContext (managedBy + health + issueCount) is attached per row +// at Summary verbosity by default. Pass ?context=none to opt out for a +// bare list. func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { if !s.requireConnected(w) { return } - kind := chi.URLParam(r, "kind") + kind := normalizeKind(chi.URLParam(r, "kind")) + group := r.URL.Query().Get("group") + level := parseVerbosity(r, aicontext.LevelSummary) + skipContext := r.URL.Query().Get("context") == "none" + + // parseNamespacesForUser primes the per-user perm cache. preflightResourceList + // then enforces the same RBAC gates as the REST list path (cluster-scoped + // SAR for cluster-only kinds, list-namespaces SAR for `kind=namespaces`, + // per-namespace and/or cluster-wide list-secrets SAR for `kind=secrets`). + // AI callers get an explicit 403 on deny instead of the empty-list shape + // the REST handler returns for backward compat. namespaces := s.parseNamespacesForUser(r) - if noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) + finalNamespaces, status, msg, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeError(w, status, msg) return } - group := r.URL.Query().Get("group") - level := parseVerbosity(r, aicontext.LevelSummary) + namespaces = finalNamespaces cache := k8s.GetResourceCache() if cache == nil { @@ -114,11 +129,23 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } - // Try typed cache first + // When a group is specified, route straight to the dynamic cache so + // CRDs whose plural collides with a core kind (e.g. Knative + // serving.knative.dev/Service vs corev1 ""/Service, KEDA's HPA-like + // kinds) reach the right resource. FetchResourceList is group-blind + // — it would silently return the core typed list, dropping the + // query's group filter on the floor. Mirrors the same group-aware + // short-circuit in handleGetResource (PR #721). + if group != "" { + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) + return + } + + // Try typed cache first (group=="" → core/built-in lookup). objs, err := k8s.FetchResourceList(cache, kind, namespaces) if err == k8s.ErrUnknownKind { // Fall through to dynamic cache for CRDs - s.aiListDynamic(w, r, cache, kind, namespaces, group, level) + s.aiListDynamic(w, r, cache, kind, namespaces, group, level, skipContext) return } if err != nil { @@ -136,11 +163,43 @@ func (s *Server) handleAIListResources(w http.ResponseWriter, r *http.Request) { return } + // Attach summaryContext per row at Summary verbosity. Compact/Detail + // already carry richer context on the get-resource path; the + // per-row attachment is specifically for cheap list triage. + // + // For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) issues + // live at namespace="" — scoping the issue index to the user's + // namespace set would silently zero issueCount on every row. The + // preflight RBAC above has already authorized cluster-scoped reads, + // so we pass nil here to compose cluster-wide. + if !skipContext && level == aicontext.LevelSummary { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + // Typed list resolves group from each object's TypeMeta — + // MinifyList sets it via SetTypeMeta before producing rows, + // so we can trust apiVersion on the typed source. + summarycontext.AttachToTypedList(results, objs, builder) + } + } + s.writeJSON(w, results) } +// issueIndexNamespaces returns the namespace slice to scope the issue +// index by. For cluster-scoped kinds (Node, PV, cluster-scoped CRDs) +// returns nil so cluster-scoped issues (which live at namespace="") are +// not filtered out by the user's namespace-restricted access set. +// Namespaced kinds pass through unchanged. +func issueIndexNamespaces(namespaces []string, kind, group string) []string { + clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group) + if clusterScoped { + return nil + } + return namespaces +} + // aiListDynamic handles the CRD/dynamic fallback for AI list. -func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel) { +func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8s.ResourceCache, kind string, namespaces []string, group string, level aicontext.VerbosityLevel, skipContext bool) { var allItems []*unstructured.Unstructured if len(namespaces) > 0 { @@ -174,6 +233,13 @@ func (s *Server) aiListDynamic(w http.ResponseWriter, r *http.Request, cache *k8 results = append(results, aicontext.MinifyUnstructured(item, level)) } + if !skipContext && level == aicontext.LevelSummary { + idxNamespaces := issueIndexNamespaces(namespaces, kind, group) + if builder := s.newResourceSummaryContextBuilder(idxNamespaces); builder != nil { + summarycontext.AttachToUnstructuredList(results, allItems, builder) + } + } + s.writeJSON(w, results) } diff --git a/internal/server/ai_handlers_rbac_test.go b/internal/server/ai_handlers_rbac_test.go index 0279bf243..43bf70415 100644 --- a/internal/server/ai_handlers_rbac_test.go +++ b/internal/server/ai_handlers_rbac_test.go @@ -99,3 +99,173 @@ func TestProxyAuth_AIGetPod_NamespaceAllowed(t *testing.T) { t.Errorf("expected 'resourceContext' field in AI get response, got: %+v", body) } } + +// AI list path RBAC at the /api/ai/resources/{kind} layer. +// +// handleAIListResources shares preflightResourceList with +// handleListResources so the same gates run on both paths: +// - cluster-scoped SAR for Node / cluster-scoped CRDs +// - list-namespaces SAR for `kind=namespaces` +// - per-namespace and/or cluster-wide list-secrets SAR for `kind=secrets` +// +// Where the REST path returns 200 with `[]` for denies (legacy SPA +// shape that doesn't leak kind existence), the AI path returns the +// explicit status so agents see the failure instead of confusing +// "empty cluster" output. + +func TestAI_SecretsList_PerNamespaceDenied_Returns403(t *testing.T) { + // alice has namespace access to default but per-namespace + // `list secrets` is denied. preflightResourceList must intercept + // before reaching the cache. + env := newAuthTestServer(t) + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + seedServerSecretListCanI(t, env, "alice", nil, []string{"default"}) + + resp := env.authGet(t, "/api/ai/resources/secrets?namespace=default", "alice", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI secrets list with per-namespace deny, got %d", resp.StatusCode) + } +} + +func TestAI_NodesList_NoClusterRBAC_Returns403(t *testing.T) { + // Nodes are cluster-scoped. Cluster-wide pod visibility + // (AllowedNamespaces nil sentinel) is not a license to read + // cluster-scoped kinds — the SAR-level gate must reject. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "nodes", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/nodes", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI nodes list without cluster-scope RBAC, got %d", resp.StatusCode) + } +} + +func TestAI_NamespacesList_NoListNamespacesSAR_Returns403(t *testing.T) { + // /api/ai/resources/namespaces returns full Namespace objects. + // Strict SAR gate — cluster-wide pod RBAC alone is not sufficient. + env := newAuthTestServer(t) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("list", "", "namespaces", "", false) + env.srv.permCache.Set("broad-reader", perms) + + resp := env.authGet(t, "/api/ai/resources/namespaces", "broad-reader", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 for AI namespaces list without list-namespaces SAR, got %d", resp.StatusCode) + } +} + +// TestAI_ListServices_WithGroup_RoutesToDynamicCache pins the group-aware +// short-circuit in handleAIListResources. For kind=services with no group, +// the typed core Service list path returns the seeded nginx Service. For +// kind=services&group=serving.knative.dev, the handler must skip the +// typed cache (which is group-blind — it would silently return core +// Services and drop the group filter on the floor) and route through +// aiListDynamic instead. Mirrors the same fix on GET in PR #721. +// +// The smoke TestMain seeds typed caches only; the dynamic resource cache +// isn't initialized, so the dynamic path surfaces a 500 with "resource +// discovery not initialized". That 500 IS the assertion: pre-fix the +// handler would return 200 with the core Service rows (silent +// wrong-kind result), which is the bug. +func TestAI_ListServices_WithGroup_RoutesToDynamicCache(t *testing.T) { + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + // Baseline: no group → typed cache returns the seeded core Service. + respCore := env.authGet(t, "/api/ai/resources/services?namespace=default", "bob", "") + defer respCore.Body.Close() + if respCore.StatusCode != http.StatusOK { + t.Fatalf("baseline (no group): expected 200, got %d", respCore.StatusCode) + } + var coreRows []map[string]any + if err := json.NewDecoder(respCore.Body).Decode(&coreRows); err != nil { + t.Fatalf("decode core: %v", err) + } + var foundNginxSvc bool + for _, row := range coreRows { + if row["kind"] == "Service" && row["name"] == "nginx" { + foundNginxSvc = true + break + } + } + if !foundNginxSvc { + t.Fatalf("baseline (no group): expected nginx Service in typed list, got %+v", coreRows) + } + + // With group: must route through aiListDynamic. Dynamic cache isn't + // initialized in the smoke harness, so we expect either 400 ("unknown + // resource kind") or 500 ("dynamic resource cache not initialized" / + // "resource discovery not initialized") — anything BUT a 200 with + // core Services, which is the pre-fix wrong-result path. + respCRD := env.authGet(t, "/api/ai/resources/services?namespace=default&group=serving.knative.dev", "bob", "") + defer respCRD.Body.Close() + if respCRD.StatusCode == http.StatusOK { + var crdRows []map[string]any + if err := json.NewDecoder(respCRD.Body).Decode(&crdRows); err == nil { + for _, row := range crdRows { + if row["name"] == "nginx" { + t.Fatalf("group=serving.knative.dev leaked typed core Service into result (pre-fix bug): row=%+v", row) + } + } + } + } + if respCRD.StatusCode != http.StatusBadRequest && respCRD.StatusCode != http.StatusInternalServerError && respCRD.StatusCode != http.StatusOK { + t.Fatalf("group=serving.knative.dev: unexpected status %d (want 400/500 from uninitialized dynamic cache, or 200 with non-core rows)", respCRD.StatusCode) + } +} + +func TestAI_DeploymentsList_HappyPath_AttachesSummaryContext(t *testing.T) { + // Allowed user, summary-verbosity default. The envelope must + // include the seeded nginx deployment AND each row must carry a + // summaryContext field (the load-bearing new wire shape this PR + // adds — pin it so a refactor that skipped attachment surfaces + // here). + env := newAuthTestServer(t) + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + + resp := env.authGet(t, "/api/ai/resources/deployments", "bob", "") + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + var rows []map[string]any + if err := json.NewDecoder(resp.Body).Decode(&rows); err != nil { + t.Fatalf("decode: %v", err) + } + if len(rows) == 0 { + t.Fatalf("allowed user got 0 deployments, expected seeded nginx") + } + + // AI list rows are flat (kind/name/namespace at the top level — + // the minified shape, distinct from the REST handler's K8s-native + // metadata-nested objects). Find the nginx row and assert + // summaryContext is present. Empty map is acceptable (the + // deployment is healthy and not managed by an external + // controller) — what matters is the envelope field exists so + // consumers don't have to special-case its absence. + var found bool + for _, row := range rows { + if row["name"] != "nginx" { + continue + } + found = true + if _, has := row["summaryContext"]; !has { + t.Errorf("nginx row missing summaryContext envelope: %+v", row) + } + } + if !found { + t.Errorf("nginx deployment not in AI list response: %+v", rows) + } +} diff --git a/internal/server/search_handler.go b/internal/server/search_handler.go index 486224ae4..7e5f56f4e 100644 --- a/internal/server/search_handler.go +++ b/internal/server/search_handler.go @@ -102,6 +102,23 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { return s.canRead(r, group, resource, "", "list") }, } + // summaryContext attaches managedBy/health/issueCount per hit. Build + // the per-request closure once (one Compose call + cached topology + // snapshot) and let the search executor invoke it per kept hit. + // ?context=none opts out so legacy callers don't pay for the join. + // + // Search uses the dual-index variant: hits are mixed-kind in one + // response (namespaced Pods alongside cluster-scoped Nodes), so a + // single-namespace-scoped issue index would zero issueCount on + // cluster-scoped hits (whose issues live at namespace=""). The + // builder routes per-hit by scope. SAR gating above + // (CanReadClusterScoped) already constrains which cluster-scoped + // kinds are reachable. + if r.URL.Query().Get("context") != "none" { + if builder := s.newSearchSummaryContextBuilder(scanNamespaces); builder != nil { + opts.SummaryBuilder = search.SummaryBuilderFunc(builder) + } + } if expr := r.URL.Query().Get("filter"); expr != "" { f, err := filter.CachedObjectFilter(expr) if err != nil { diff --git a/internal/server/server.go b/internal/server/server.go index 11aea51d9..836a89639 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1010,17 +1010,20 @@ func (s *Server) handleAPIResources(w http.ResponseWriter, r *http.Request) { s.writeJSON(w, resources) } -func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { - if !s.requireConnected(w) { - return - } - kind := normalizeKind(chi.URLParam(r, "kind")) - group := r.URL.Query().Get("group") // API group for CRD disambiguation - - // parseNamespacesForUser primes the per-user perm cache (triggers - // DiscoverNamespaces if needed). canRead below relies on it. - namespaces := s.parseNamespacesForUser(r) - +// preflightResourceList runs the per-user RBAC gates shared by the REST +// (/api/resources/{kind}) and AI (/api/ai/resources/{kind}) list paths. +// It assumes the caller has already populated `namespaces` via +// parseNamespacesForUser (which primes the canI cache that canRead relies on) +// and has classified the kind for cluster-scope. +// +// Returns the (possibly-rewritten) namespace slice that downstream cache +// reads should use. When ok=false the gate denied or the user has no +// namespace access; (status, msg) carry the canonical HTTP response. REST +// callers historically convert denies to a 200 with `[]` to avoid leaking +// kind existence; the AI path returns the explicit status so agents see the +// failure. Same gates run in the same order on both paths — the response +// shape is the only thing that differs. +func (s *Server) preflightResourceList(r *http.Request, kind, group string, namespaces []string) (finalNamespaces []string, status int, msg string, ok bool) { // "namespaces" is cluster-scoped at the K8s API. Full Namespace objects // (labels, annotations, spec) require explicit list-namespaces SAR. // AllowedNamespaces is NOT a sufficient fallback: list-pods-in-alpha @@ -1032,10 +1035,9 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { isNamespacesKind := kind == "namespaces" || kind == "namespace" if isNamespacesKind { if !s.canRead(r, "", "namespaces", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list namespaces", false } - namespaces = nil // full lister output for SAR-authorized users + return nil, 0, "", true // full lister output for SAR-authorized users } // Cluster-only kinds (Nodes, PVs, StorageClasses, ClusterRoles, cluster- @@ -1043,19 +1045,19 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // noNamespaceAccess check so a user with explicit cluster-scoped RBAC but // no namespace access can still read those resources. isClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(kind, group) - if isClusterScoped && !isNamespacesKind { + if isClusterScoped { if !s.canRead(r, gvrGroup, gvrResource, "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, fmt.Sprintf("insufficient permissions to list %s", kind), false } // Cluster-scoped reads have no namespace dimension. Once the // resource-level SAR passes, force the later typed/dynamic cache paths // through their cluster-wide branch even if the user also has a // namespace view preference. - namespaces = nil - } else if !isNamespacesKind && noNamespaceAccess(namespaces) { - s.writeJSON(w, []any{}) - return + return nil, 0, "", true + } + + if noNamespaceAccess(namespaces) { + return namespaces, http.StatusForbidden, "no namespace access", false } // Per-kind RBAC inside a namespace. Helm release storage IS K8s Secrets, @@ -1064,26 +1066,48 @@ func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { // radar/templates/clusterrole.yaml). When any of those triggers fires // the cache holds every secret in the cluster, so per-user RBAC must // gate the read. Other namespaced kinds are deferred. - if (kind == "secrets" || kind == "secret") && !isClusterScoped { + if kind == "secrets" || kind == "secret" { if auth.UserFromContext(r.Context()) != nil { if namespaces == nil { // Auth user with cluster-wide namespace access (e.g. picked up // via DiscoverNamespaces stage 1: cluster-wide list pods). The // cache will serve all secrets — gate on cluster-scope SAR. if !s.canRead(r, "", "secrets", "", "list") { - s.writeJSON(w, []any{}) - return + return nil, http.StatusForbidden, "insufficient permissions to list secrets", false } } else { namespaces = s.filterNamespacesByCanRead(r, "", "secrets", "list", namespaces) if len(namespaces) == 0 { - s.writeJSON(w, []any{}) - return + return namespaces, http.StatusForbidden, "insufficient permissions to list secrets", false } } } } + return namespaces, 0, "", true +} + +func (s *Server) handleListResources(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + kind := normalizeKind(chi.URLParam(r, "kind")) + group := r.URL.Query().Get("group") // API group for CRD disambiguation + + // parseNamespacesForUser primes the per-user perm cache (triggers + // DiscoverNamespaces if needed). canRead below relies on it. + namespaces := s.parseNamespacesForUser(r) + + // Shared RBAC gate. REST converts denies to 200 with `[]` (legacy shape + // the SPA tolerates and that doesn't leak kind existence); the AI path + // returns the explicit status. + finalNamespaces, _, _, ok := s.preflightResourceList(r, kind, group, namespaces) + if !ok { + s.writeJSON(w, []any{}) + return + } + namespaces = finalNamespaces + cache := k8s.GetResourceCache() if cache == nil { s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") diff --git a/internal/server/summary_context.go b/internal/server/summary_context.go new file mode 100644 index 000000000..6853930a1 --- /dev/null +++ b/internal/server/summary_context.go @@ -0,0 +1,70 @@ +// Per-request helpers that compute the compact ResourceSummaryContext attached +// to /api/ai/resources/{kind} list rows and /api/search hits. +// +// The shared core (issue index, kind canonicalization, managedBy +// resolution, per-row scope dispatch) lives in +// internal/summarycontext. This file is the REST-specific wrapper — +// it sources topology from the server-wide broadcaster cache and +// otherwise just plumbs arguments through. + +package server + +import ( + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/summarycontext" +) + +// newResourceSummaryContextBuilder assembles the per-request closure for the +// list/search handlers. Returns nil when the cache or topology isn't +// available, in which case callers should skip context attachment +// rather than emit empty objects. +// +// Callers pass the namespace list they're scanning so the issue index +// is scoped to just those rows (the full Compose call on a 100-namespace +// cluster is fine; this is mostly belt-and-suspenders for very large +// envs). Pass nil to compose cluster-wide. +// +// Use newSearchSummaryContextBuilder for search, which routes per-hit +// between a namespaced and a cluster-wide index — search returns mixed +// kinds in one response, so a single index can't get both right. +func (s *Server) newResourceSummaryContextBuilder(namespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + idx := summarycontext.BuildIssueIndex(provider, namespaces) + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), idx, idx) +} + +// newSearchSummaryContextBuilder is the search-specific variant. Search +// hits are MIXED-kind in one response — a single query can return both +// namespaced Pods and cluster-scoped Nodes. A single issue index can't +// be both: scoped to the user's namespaces it would silently zero +// issueCount on Node/PV/cluster-scoped CRD hits (whose issues live at +// namespace=""); composed cluster-wide it would over-count or pull in +// rows the namespace-restricted user shouldn't see. +// +// Fix: build two indexes per request. namespacedIdx is scoped to +// scanNamespaces (intersection of user RBAC and the query's `ns:` +// modifier). clusterIdx is composed cluster-wide (nil filter) so +// namespace="" issues surface. The returned closure dispatches per-hit +// via k8s.ClassifyKindScope(kind, group). Search-level RBAC +// (CanReadClusterScoped) already gated which cluster-scoped kinds the +// user can see, so the cluster-wide index doesn't expose unauthorized +// rows. +// +// The cluster-wide index is skipped when scanNamespaces is already nil +// (cluster-wide user) — both indexes would be identical, so one pass +// suffices. +func (s *Server) newSearchSummaryContextBuilder(scanNamespaces []string) summarycontext.Builder { + provider := issues.NewCacheProvider() + if provider == nil { + return nil + } + namespacedIdx := summarycontext.BuildIssueIndex(provider, scanNamespaces) + clusterIdx := namespacedIdx + if scanNamespaces != nil { + clusterIdx = summarycontext.BuildIssueIndex(provider, nil) + } + return summarycontext.BuilderFromIndexes(s.broadcaster.GetCachedTopology(), namespacedIdx, clusterIdx) +} diff --git a/internal/server/summary_context_test.go b/internal/server/summary_context_test.go new file mode 100644 index 000000000..af4fb3f22 --- /dev/null +++ b/internal/server/summary_context_test.go @@ -0,0 +1,231 @@ +// Wiring tests for the REST-side ResourceSummaryContext builders. The pure- +// function tests (issueIndex key arithmetic, BuildIssueIndex over a +// fake provider, CanonicalSingular, ManagedByFromRelationships) live in +// internal/summarycontext alongside the shared core they exercise. + +package server + +import ( + "encoding/json" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/summarycontext" + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/pkg/resourcecontext" +) + +// stubBuilder records calls and returns a deterministic ResourceSummaryContext +// keyed by the resource identity. Avoids standing up a topology cache or +// issue provider — those are exercised by the per-layer unit tests. +// +// Key shape mirrors the production issueIndexKey (group|kind|ns|name) +// so test fixtures pin the group-aware lookup. +func stubBuilder(t *testing.T, want map[string]*resourcecontext.ResourceSummaryContext) summarycontext.Builder { + t.Helper() + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + key := group + "|" + kind + "|" + namespace + "|" + name + return want[key] + } +} + +// TestAttachResourceSummaryContextToList wires together MinifyList + the +// per-row attach helper and asserts the ResourceSummaryContext field lands in +// the JSON each row marshals to. +func TestAttachResourceSummaryContextToList(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-1", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning, ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}}, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "api-2", Namespace: "prod"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + } + // Group is "" for core-group Pods. + want := map[string]*resourcecontext.ResourceSummaryContext{ + "|Pod|prod|api-1": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "healthy", + IssueCount: 0, + }, + "|Pod|prod|api-2": { + ManagedBy: &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + Health: "unhealthy", + IssueCount: 3, + }, + } + + results, err := aicontext.MinifyList(objs, aicontext.LevelSummary) + if err != nil { + t.Fatalf("MinifyList: %v", err) + } + summarycontext.AttachToTypedList(results, objs, stubBuilder(t, want)) + + // Row 0 — healthy pod. + b, _ := json.Marshal(results[0]) + wantSubs := []string{ + `"summaryContext":`, + `"managedBy":{"kind":"Deployment"`, + `"health":"healthy"`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 0 missing %s in %s", sub, b) + } + } + + // Row 1 — unhealthy pod with issueCount. + b, _ = json.Marshal(results[1]) + wantSubs = []string{ + `"health":"unhealthy"`, + `"issueCount":3`, + } + for _, sub := range wantSubs { + if !contains(string(b), sub) { + t.Errorf("row 1 missing %s in %s", sub, b) + } + } +} + +// TestAttachResourceSummaryContextToList_MismatchedLengthsSilent — defensive +// path that protects against a future refactor where MinifyList might +// drop unsupported kinds. Attach must skip rather than panic. +func TestAttachResourceSummaryContextToList_MismatchedLengthsSilent(t *testing.T) { + objs := []runtime.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "api-1"}}, + } + results := []any{ + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-1"}, + &aicontext.ResourceSummary{Kind: "Pod", Name: "api-2"}, + } + // Length mismatch (1 obj vs 2 results) — must not panic, must skip. + summarycontext.AttachToTypedList(results, objs, func(obj runtime.Object, _ *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + return &resourcecontext.ResourceSummaryContext{Health: "healthy"} + }) + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok { + t.Fatalf("row %d: unexpected type %T", i, row) + } + if summary.SummaryContext != nil { + t.Errorf("row %d: ResourceSummaryContext should be nil on length mismatch, got %#v", i, summary.SummaryContext) + } + } +} + +// TestAttachResourceSummaryContextToUnstructuredList covers the dynamic-CRD +// path. summarizeUnstructured returns *ResourceSummary so the attach +// helper is symmetric with the typed path. +func TestAttachResourceSummaryContextToUnstructuredList(t *testing.T) { + items := []*unstructured.Unstructured{ + {Object: map[string]any{ + "apiVersion": "argoproj.io/v1alpha1", + "kind": "Application", + "metadata": map[string]any{"name": "storefront", "namespace": "argocd"}, + "status": map[string]any{"conditions": []any{map[string]any{"type": "Ready", "status": "True"}}}, + }}, + } + want := map[string]*resourcecontext.ResourceSummaryContext{ + "argoproj.io|Application|argocd|storefront": { + Health: "healthy", + IssueCount: 1, + }, + } + + results := []any{aicontext.MinifyUnstructured(items[0], aicontext.LevelSummary)} + summarycontext.AttachToUnstructuredList(results, items, stubBuilder(t, want)) + + summary, ok := results[0].(*aicontext.ResourceSummary) + if !ok || summary == nil { + t.Fatalf("unexpected row type %T", results[0]) + } + if summary.SummaryContext == nil { + t.Fatalf("ResourceSummaryContext not attached") + } + if summary.SummaryContext.Health != "healthy" { + t.Errorf("Health = %q, want healthy", summary.SummaryContext.Health) + } + if summary.SummaryContext.IssueCount != 1 { + t.Errorf("IssueCount = %d, want 1", summary.SummaryContext.IssueCount) + } +} + +// contains is a tiny strings.Contains alias kept local so the test file +// doesn't need a strings import alongside the existing imports. +func contains(s, sub string) bool { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false +} + +// TestIssueIndexNamespaces_ClusterScopedDropsFilter pins the fix for the +// "cluster-scoped issues filtered out for cluster-scoped rows" bug. +// Pre-fix, handleAIListResources passed the user's namespaced-access set +// straight into the issue index. For cluster-scoped kinds (Node, PV, +// cluster-scoped CRDs) every issue lives at namespace="" — the index +// then dropped them all, silently zeroing issueCount on every row even +// when the user had cluster-scoped read access. The helper now returns +// nil for cluster-scoped kinds so Compose runs cluster-wide. +func TestIssueIndexNamespaces_ClusterScopedDropsFilter(t *testing.T) { + userNs := []string{"prod", "staging"} + + // Cluster-scoped built-ins from the static catalogue (ClassifyKindScope + // hits ClusterOnlyKindGVR before touching discovery, so this works + // without a discovery client wired up). + clusterCases := []struct { + kind string + group string + }{ + {"Node", ""}, + {"nodes", ""}, + {"PersistentVolume", ""}, + {"ClusterRole", "rbac.authorization.k8s.io"}, + {"StorageClass", "storage.k8s.io"}, + } + for _, tc := range clusterCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if got != nil { + t.Errorf("issueIndexNamespaces(%q, %q) = %v, want nil — cluster-scoped kinds must not be namespace-filtered", + tc.kind, tc.group, got) + } + } + + // Namespaced kinds preserve the user's namespace set as-is so the + // scoping the per-user RBAC enforced upstream is honored. + namespacedCases := []struct { + kind string + group string + }{ + {"Pod", ""}, + {"Deployment", "apps"}, + {"ConfigMap", ""}, + } + for _, tc := range namespacedCases { + got := issueIndexNamespaces(userNs, tc.kind, tc.group) + if len(got) != len(userNs) { + t.Errorf("issueIndexNamespaces(%q, %q) len = %d, want %d (namespace filter must pass through for namespaced kinds)", + tc.kind, tc.group, len(got), len(userNs)) + continue + } + for i := range got { + if got[i] != userNs[i] { + t.Errorf("issueIndexNamespaces(%q, %q)[%d] = %q, want %q", + tc.kind, tc.group, i, got[i], userNs[i]) + } + } + } + + // Pass-through when caller already provided nil (cluster-wide). + if got := issueIndexNamespaces(nil, "Pod", ""); got != nil { + t.Errorf("issueIndexNamespaces(nil, Pod) = %v, want nil", got) + } +} diff --git a/internal/summarycontext/attach.go b/internal/summarycontext/attach.go new file mode 100644 index 000000000..3f0ea18ce --- /dev/null +++ b/internal/summarycontext/attach.go @@ -0,0 +1,70 @@ +package summarycontext + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + aicontext "github.com/skyhook-io/radar/pkg/ai/context" + "github.com/skyhook-io/radar/internal/k8s" +) + +// AttachToTypedList fills in SummaryContext for each *aicontext.ResourceSummary +// row produced from typed runtime.Object items (typed-cache list path). +// results and objs must be parallel slices — length mismatch is treated as a +// caller bug and the function returns without touching the rows. +// +// Group is sourced per-object from the typed object's GVK via SetTypeMeta + +// GetObjectKind, so list paths that mix kinds stay correct. +func AttachToTypedList(results []any, objs []runtime.Object, builder Builder) { + if len(results) != len(objs) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromObject(objs[i]) + summary.SummaryContext = builder(objs[i], nil, group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// AttachToUnstructuredList is the dynamic-CRD counterpart of +// AttachToTypedList. Group comes from each item's apiVersion so two CRDs that +// share kind+ns+name across API groups (e.g. multiple operators each shipping +// a "Cluster" resource) get independent issue counts. +func AttachToUnstructuredList(results []any, items []*unstructured.Unstructured, builder Builder) { + if len(results) != len(items) { + return + } + for i, row := range results { + summary, ok := row.(*aicontext.ResourceSummary) + if !ok || summary == nil { + continue + } + group := GroupFromUnstructured(items[i]) + summary.SummaryContext = builder(nil, items[i], group, summary.Kind, summary.Namespace, summary.Name) + } +} + +// GroupFromObject extracts the API group from a typed runtime.Object's +// GroupVersionKind. Returns "" for core-group objects (Pod, Service, etc.) +// and when the GVK is unset. Calls k8s.SetTypeMeta so the GVK is populated +// from scheme metadata when the object came out of the typed cache without +// it set. +func GroupFromObject(obj runtime.Object) string { + if obj == nil { + return "" + } + k8s.SetTypeMeta(obj) + return obj.GetObjectKind().GroupVersionKind().Group +} + +// GroupFromUnstructured pulls the API group from an unstructured's apiVersion. +// Mirrors GroupFromObject for the dynamic-CRD path. +func GroupFromUnstructured(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + return u.GroupVersionKind().Group +} diff --git a/internal/summarycontext/summarycontext.go b/internal/summarycontext/summarycontext.go new file mode 100644 index 000000000..e375e6ab2 --- /dev/null +++ b/internal/summarycontext/summarycontext.go @@ -0,0 +1,225 @@ +// Package summarycontext is the shared core that powers the compact +// ResourceSummaryContext attached to /api/ai/resources/{kind} list rows, /api/search +// hits, and the MCP list_resources / search variants. +// +// The REST and MCP wrappers (internal/server, internal/mcp) differ only +// in their topology source — REST reads from a server-wide broadcaster +// cache; MCP memoizes per-process builds. Everything else (issue index, +// kind canonicalization, managedBy resolution, per-row dispatch by +// scope) is identical, so it lives here. +// +// pkg/resourcecontext intentionally has no dependencies on internal/* +// or pkg/topology; the join happens here. +package summarycontext + +import ( + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// Builder is the per-request closure that produces a ResourceSummaryContext for +// a single resource. nil result is fine — the ResourceSummaryContext field is +// omitempty on every consumer. +// +// group is required so the per-resource issue lookup can distinguish +// CRDs that share kind+namespace+name across API groups (e.g. Knative +// Service vs corev1 Service, or two custom CRDs both named "Cluster" +// from different operators). Pass "" for core-group resources. +type Builder func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext + +// BuilderFromIndexes assembles the per-request closure. The list path +// passes the same index for both namespacedIdx and clusterIdx (single- +// kind list, scope already chosen by the caller); search passes two +// distinct indexes — namespacedIdx scoped to user namespaces, clusterIdx +// composed cluster-wide. The closure dispatches per-hit by scope so +// cluster-scoped hits read the cluster-wide index and surface +// namespace="" issues that the namespaced filter would otherwise drop. +// +// topo is the topology snapshot the caller has already obtained from +// its preferred source (REST: broadcaster cache; MCP: short-TTL +// memoizer). nil topo is fine — managedBy is omitted but issueCount +// still resolves. +func BuilderFromIndexes(topo *topology.Topology, namespacedIdx, clusterIdx IssueIndex) Builder { + resourceProvider := k8s.NewTopologyResourceProvider(k8s.GetResourceCache()) + dynamicProvider := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + + // One inverted-edges index per request — without it each + // GetRelationships call would re-scan topo.Edges in O(E), turning + // the list/search hot path into O(N × E). See pkg/topology T3. + var relIdx *topology.RelationshipsIndex + if topo != nil { + relIdx = topology.IndexByResource(topo) + } + + return func(obj runtime.Object, u *unstructured.Unstructured, group, kind, namespace, name string) *resourcecontext.ResourceSummaryContext { + var managedBy *resourcecontext.ManagedByRef + if topo != nil { + // Pass the fetched object when available so synthesis is + // group-aware (avoids kind/plural collisions like Knative + // Service vs corev1 Service). Falls back to (kind, ns, name) + // lookup when neither obj nor u is set. + var rawObj any + switch { + case obj != nil: + rawObj = obj + case u != nil: + rawObj = u + } + rel := topology.GetRelationshipsWithObject(kind, namespace, name, rawObj, topo, resourceProvider, dynamicProvider, relIdx) + managedBy = ManagedByFromRelationships(rel) + } + var source runtime.Object = obj + if source == nil && u != nil { + source = u + } + // Dispatch by scope: cluster-scoped hits read clusterIdx (composed + // at namespace=nil so namespace="" issues are present), namespaced + // hits read namespacedIdx (which honors the user's namespace + // filter so the per-row count doesn't pull in noise from + // namespaces the user can't see). + idx := namespacedIdx + if clusterScoped, _, _ := k8s.ClassifyKindScope(kind, group); clusterScoped { + idx = clusterIdx + } + return resourcecontext.BuildSummary(source, resourcecontext.SummaryOptions{ + ManagedBy: managedBy, + IssueCount: idx.Count(group, kind, namespace, name), + }) + } +} + +// IssueIndex keys per-resource issue counts as "group|kind|namespace|name". +// Group goes FIRST so two CRDs sharing kind+namespace+name across API +// groups (e.g. Knative serving.knative.dev/Service vs corev1 ""/Service, +// or two operators each shipping a "Cluster" CRD) get independent counts +// instead of inheriting each other's. Kind is canonicalized via +// CanonicalSingular because issue sources emit the kind as-typed +// (Deployment) while callers may pass the URL plural (deployments) — +// canonicalization normalizes both. "|" can't appear in a Kubernetes API +// group (groups follow DNS subdomain rules), so it's a safe delimiter. +type IssueIndex map[string]int + +// Count returns the per-resource issue count, keyed by the group-aware +// composite key. Zero on miss. +func (i IssueIndex) Count(group, kind, namespace, name string) int { + return i[issueIndexKey(group, kind, namespace, name)] +} + +func issueIndexKey(group, kind, namespace, name string) string { + return group + "|" + strings.ToLower(CanonicalSingular(kind)) + "|" + namespace + "|" + name +} + +// CanonicalSingular collapses common plural forms back to the singular +// kind the issue engine emits. Cheap surface — only the kinds we +// actually scan in list_resources / search. +func CanonicalSingular(kind string) string { + k := strings.ToLower(kind) + switch k { + case "pods": + return "pod" + case "services": + return "service" + case "deployments": + return "deployment" + case "daemonsets": + return "daemonset" + case "statefulsets": + return "statefulset" + case "replicasets": + return "replicaset" + case "jobs": + return "job" + case "cronjobs": + return "cronjob" + case "ingresses": + return "ingress" + case "configmaps": + return "configmap" + case "secrets": + return "secret" + case "persistentvolumeclaims": + return "persistentvolumeclaim" + case "persistentvolumes": + return "persistentvolume" + case "storageclasses": + return "storageclass" + case "horizontalpodautoscalers", "hpas", "hpa": + return "horizontalpodautoscaler" + case "poddisruptionbudgets": + return "poddisruptionbudget" + case "nodes": + return "node" + case "namespaces": + return "namespace" + case "events": + return "event" + } + return k +} + +// BuildIssueIndex composes the per-request issue index. NoLimit (not +// MaxLimit) is required here: a 5000-issue cluster would otherwise +// truncate after the first 1000 sorted rows, silently zeroing +// issueCount for resources whose issues fall in the tail. We're +// bucketing for a per-resource lookup, not paginating — the caller of +// the builder never sees the issue list itself. +// +// We rely on Filters.IncludeAudit and Filters.IncludeEvents staying +// false-by-default — that's what keeps the per-row count to "problem" +// + "condition" only. Audit + Warning events are loud and require +// explicit opt-in; rolling them into the per-row count would distort +// "this Pod has 1 issue" for the common case. +// +// No Kinds filter on Compose: the index buckets every composed row by +// (group, kind, ns, name), and the per-row lookup keys off +// issueIndexKey(...) with the same canonicalization, so kind-mismatched +// rows simply never read. Filtering Compose itself by Kind would need +// CRD-plural awareness — CanonicalSingular handles built-ins but +// returns CRD plurals (e.g. "applications") unchanged, and the issue +// engine emits "Application", silently zeroing issueCount on every CRD +// row. Bucketing is O(N) over the at-most-namespace-bounded issue set, +// which the consumer materialises anyway. +func BuildIssueIndex(p issues.Provider, namespaces []string) IssueIndex { + filters := issues.Filters{ + Namespaces: namespaces, + Limit: issues.NoLimit, + } + composed := issues.Compose(p, filters) + idx := make(IssueIndex, len(composed)) + for _, iss := range composed { + idx[issueIndexKey(iss.Group, iss.Kind, iss.Namespace, iss.Name)]++ + } + return idx +} + +// ManagedByFromRelationships extracts a compact ManagedByRef from +// computed topology relationships. Preference order: +// 1. Relationships.ManagedBy[0] — the server-synthesized topmost +// manager (ArgoCD Application > Flux Kustomization/HelmRelease > +// Helm release > topmost K8s owner). Walks the owner chain past +// ReplicaSets to the controlling Deployment in one shot. +// 2. Direct Owner — fallback for shapes ManagedBy synthesis declines +// (e.g. cluster-scoped roots where the topmost manager is the +// resource itself). +// +// Returns nil when topology has no relationship for the resource. +func ManagedByFromRelationships(rel *topology.Relationships) *resourcecontext.ManagedByRef { + if rel == nil { + return nil + } + if len(rel.ManagedBy) > 0 { + ref := rel.ManagedBy[0] + return resourcecontext.ManagedByFromOwner(ref.Kind, ref.Group, ref.Namespace, ref.Name) + } + if rel.Owner != nil { + return resourcecontext.ManagedByFromOwner(rel.Owner.Kind, rel.Owner.Group, rel.Owner.Namespace, rel.Owner.Name) + } + return nil +} diff --git a/internal/summarycontext/summarycontext_test.go b/internal/summarycontext/summarycontext_test.go new file mode 100644 index 000000000..f032e1f52 --- /dev/null +++ b/internal/summarycontext/summarycontext_test.go @@ -0,0 +1,376 @@ +// Pure-function tests for the shared summarycontext core. The +// REST/MCP-specific wiring tests (attachSummaryContextToList, +// dispatch-on-CanReadClusterScoped, the ai-handler issueIndexNamespaces +// helper) stay at their respective handler sites in internal/server +// and internal/mcp. + +package summarycontext + +import ( + "fmt" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/internal/issues" + "github.com/skyhook-io/radar/internal/k8s" + bp "github.com/skyhook-io/radar/pkg/audit" + "github.com/skyhook-io/radar/pkg/policyreports" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// fakeIssuesProvider is a minimal issues.Provider for the BuildIssueIndex +// tests. Only the fields the index path touches are wired. +// +// DetectProblems mirrors CacheProvider.DetectProblems: empty namespaces +// returns the full set; a non-empty slice drops cluster-scoped rows +// (Namespace=="") to match the production flattenNamespacedProblems +// behavior — needed so the cluster-scoped-filter regression test can +// pin the actual bug. +type fakeIssuesProvider struct { + problems []k8s.Problem +} + +func (f *fakeIssuesProvider) DetectProblems(namespaces []string) []k8s.Problem { + if len(namespaces) == 0 { + return f.problems + } + allowed := map[string]bool{} + for _, ns := range namespaces { + allowed[ns] = true + } + out := make([]k8s.Problem, 0, len(f.problems)) + for _, p := range f.problems { + if p.Namespace == "" { + continue + } + if allowed[p.Namespace] { + out = append(out, p) + } + } + return out +} +func (f *fakeIssuesProvider) DetectCAPIProblems(_ []string) []k8s.Problem { return nil } +func (f *fakeIssuesProvider) AuditFindings(_ []string) []bp.Finding { return nil } +func (f *fakeIssuesProvider) WarningEvents(_ []string, _ time.Duration) []*corev1.Event { + return nil +} +func (f *fakeIssuesProvider) WatchedDynamic() []schema.GroupVersionResource { return nil } +func (f *fakeIssuesProvider) ListDynamic(_ schema.GroupVersionResource, _ string) ([]*unstructured.Unstructured, error) { + return nil, nil +} +func (f *fakeIssuesProvider) KindForGVR(_ schema.GroupVersionResource) string { return "" } +func (f *fakeIssuesProvider) KyvernoFindings() []policyreports.SubjectFindings { return nil } +func (f *fakeIssuesProvider) KyvernoStatus() string { return "" } + +func fmtPodName(i int) string { return fmt.Sprintf("pod-%05d", i) } + +// TestIssueIndexKey_GroupAware pins that two resources sharing +// kind+namespace+name but in different API groups get independent +// counts. Without group in the key, e.g. Knative serving.knative.dev/ +// Service vs corev1 ""/Service collapse onto one bucket — and either +// the CRD inherits the core Service's count or vice versa. This breaks +// the moment a user has two operators each shipping a kind named +// "Cluster" in the same namespace. +func TestIssueIndexKey_GroupAware(t *testing.T) { + idx := IssueIndex{} + // Same kind+ns+name, different groups — must be independent buckets. + idx[issueIndexKey("", "Service", "prod", "api")] = 2 + idx[issueIndexKey("serving.knative.dev", "Service", "prod", "api")] = 5 + + if got := idx.Count("", "Service", "prod", "api"); got != 2 { + t.Errorf("core Service count = %d, want 2 (Knative bucket bleeding through?)", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 5 { + t.Errorf("Knative Service count = %d, want 5 (collided with core Service bucket?)", got) + } + // Wrong group lookup is a miss, not a fallback. + if got := idx.Count("example.io", "Service", "prod", "api"); got != 0 { + t.Errorf("unknown-group lookup = %d, want 0 (key should not coalesce across groups)", got) + } +} + +// TestBuildIssueIndex_GroupAware exercises the full BuildIssueIndex +// path with two CRDs that share kind+namespace+name but live in +// different API groups. Pre-fix, both rows landed under the same +// "service|prod|api" key and one inherited the other's count. +func TestBuildIssueIndex_GroupAware(t *testing.T) { + // Inject via a fake issues.Provider rather than the cache plumbing — + // keeps the test focused on the index-key arithmetic. + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Service", Group: "", Namespace: "prod", Name: "api", Reason: "Endpoints", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RevisionFailed", Severity: "warning"}, + {Kind: "Service", Group: "serving.knative.dev", Namespace: "prod", Name: "api", Reason: "RouteNotReady", Severity: "warning"}, + }, + } + idx := BuildIssueIndex(p, nil) + if got := idx.Count("", "Service", "prod", "api"); got != 1 { + t.Errorf("core Service count = %d, want 1", got) + } + if got := idx.Count("serving.knative.dev", "Service", "prod", "api"); got != 2 { + t.Errorf("Knative Service count = %d, want 2", got) + } +} + +// TestBuildIssueIndex_BeyondMaxLimit pins that resources whose issues +// would fall in the tail beyond MaxLimit still get correct issueCounts. +// Pre-fix, BuildIssueIndex passed Limit:MaxLimit (1000) to Compose; on +// a cluster with >1000 issues the post-sort truncation silently zeroed +// out counts for tail resources. The fix is Limit:NoLimit — the index +// is a bucketed count, not a paginated list. +func TestBuildIssueIndex_BeyondMaxLimit(t *testing.T) { + probs := make([]k8s.Problem, 0, issues.MaxLimit+50) + for i := 0; i < issues.MaxLimit+50; i++ { + probs = append(probs, k8s.Problem{ + Kind: "Pod", Namespace: "prod", Name: fmtPodName(i), Reason: "ImagePullBackOff", Severity: "warning", + }) + } + p := &fakeIssuesProvider{problems: probs} + idx := BuildIssueIndex(p, nil) + tailName := fmtPodName(issues.MaxLimit + 25) + if got := idx.Count("", "Pod", "prod", tailName); got != 1 { + t.Fatalf("tail pod %s count = %d, want 1 (silent MaxLimit truncation?)", tailName, got) + } + if got := idx.Count("", "Pod", "prod", fmtPodName(0)); got != 1 { + t.Errorf("head pod count = %d, want 1", got) + } +} + +// TestCanonicalSingular pins the kind normalization used to align URL +// plurals with the singular form the issue engine emits. +func TestCanonicalSingular(t *testing.T) { + cases := map[string]string{ + "pods": "pod", + "Pods": "pod", + "Deployment": "deployment", + "deployments": "deployment", + "hpa": "horizontalpodautoscaler", + "unknownkind": "unknownkind", + } + for in, want := range cases { + if got := CanonicalSingular(in); got != want { + t.Errorf("CanonicalSingular(%q) = %q, want %q", in, got, want) + } + } +} + +// TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered pins the +// end-to-end behavior: when the builder passes nil for the namespace +// filter (cluster-scoped kind), node-level issues at namespace="" +// surface in the index and the per-resource lookup returns the correct +// count. With a namespace filter populated, those same issues are +// dropped because Compose's per-namespace problem walk never sees them. +func TestBuildIssueIndex_ClusterScopedIssueSurfacedWhenUnfiltered(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + // Cluster-scoped Node issue: namespace="" — the actual shape + // k8s.DetectProblems emits for NodeNotReady / DiskPressure etc. + {Kind: "Node", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + }, + } + + // Cluster-wide compose (nil namespaces) — issue surfaces. + idx := BuildIssueIndex(p, nil) + if got := idx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("cluster-wide index: Node issueCount = %d, want 1 (cluster-scoped issue should appear)", got) + } + + // Namespace-scoped compose — same issue, but ns filter to + // ["prod","staging"] drops it because the user-namespaced perm + // slice never matches "". This is what the pre-fix handler did for + // Node lists. + scopedIdx := BuildIssueIndex(p, []string{"prod", "staging"}) + if got := scopedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespace-scoped index: Node issueCount = %d, want 0 (namespace filter drops cluster-scoped issue)", got) + } +} + +// TestBuildIssueIndex_CRDPlural_NonZeroCount pins the fix for a Bugbot +// finding on PR #722: a CRD listed by its plural form (e.g. +// "applications" for ArgoCD Application) silently returned +// issueCount=0 because BuildIssueIndex used to push the URL kind +// through CanonicalSingular into filters.Kinds. CanonicalSingular only +// covers built-in plurals — CRD plurals fell through unchanged +// ("applications" stayed "applications"), Compose's case-insensitive +// Kind filter then failed against the singular "Application" the +// issue engine emits, and every CRD row's count was zero. We dropped +// the Kinds filter entirely: bucketing by issueIndexKey(group, kind, +// ns, name) is already correct because the lookup side runs through +// CanonicalSingular too. Per-resource lookup uses the row's singular +// Kind (Pascal "Application") so the index and the query agree. +func TestBuildIssueIndex_CRDPlural_NonZeroCount(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Application", Group: "argoproj.io", Namespace: "argocd", Name: "storefront", Reason: "SyncFailed", Severity: "critical"}, + }, + } + + // Pre-fix simulation: the handler would have passed kindFilter="applications" + // — the URL plural. We no longer take a kindFilter, but verify that + // the index contains the row keyed by the canonical singular form. + idx := BuildIssueIndex(p, []string{"argocd"}) + if got := idx.Count("argoproj.io", "Application", "argocd", "storefront"); got != 1 { + t.Errorf("CRD Application count (singular kind) = %d, want 1", got) + } + // Also pin the URL-form lookup path: the per-row Builder is called + // with the kind as returned by MinifyUnstructured, which for CRDs + // is the singular ("Application"). If a caller ever pushed the + // plural ("applications") through Count(), CanonicalSingular won't + // normalize unknown CRD plurals — that's a separate latent issue + // that doesn't manifest today because the row source uses the + // singular. Document the asymmetry explicitly. + if got := idx.Count("argoproj.io", "applications", "argocd", "storefront"); got != 0 { + t.Errorf("CRD lookup via plural = %d, want 0 (CanonicalSingular only normalizes built-ins; row source uses singular Kind, so lookup matches via singular path)", got) + } +} + +// TestNewSearchSummaryContextBuilder_BuildsDualIndex pins the end-to-end +// shape used by /api/search and MCP search: scanNamespaces is non-nil +// (a namespace-restricted user, or a user with a `ns:` query modifier), +// so the constructor must compose TWO issue indexes — one scoped to +// those namespaces, one cluster-wide for cluster-scoped hits. Without +// the second index, the Node hit's summaryContext.issueCount returns +// 0 because every Node issue lives at namespace="" and the namespace +// filter drops them. +func TestNewSearchSummaryContextBuilder_BuildsDualIndex(t *testing.T) { + p := &fakeIssuesProvider{ + problems: []k8s.Problem{ + {Kind: "Node", Group: "", Namespace: "", Name: "worker-1", Reason: "NotReady", Severity: "critical"}, + {Kind: "Pod", Group: "", Namespace: "prod", Name: "api-7", Reason: "ImagePullBackOff", Severity: "warning"}, + }, + } + + // Build the two indexes the search constructor would build. + namespacedIdx := BuildIssueIndex(p, []string{"prod"}) + clusterIdx := BuildIssueIndex(p, nil) + + // Sanity: pre-fix, the search handler passed namespacedIdx for + // both; Node issueCount silently zeroed. + if got := namespacedIdx.Count("", "Node", "", "worker-1"); got != 0 { + t.Errorf("namespacedIdx Node count = %d, want 0 (sanity — namespace filter drops cluster-scoped issues)", got) + } + if got := clusterIdx.Count("", "Node", "", "worker-1"); got != 1 { + t.Errorf("clusterIdx Node count = %d, want 1 (cluster-wide compose surfaces namespace=\"\" issues)", got) + } + if got := namespacedIdx.Count("", "Pod", "prod", "api-7"); got != 1 { + t.Errorf("namespacedIdx Pod count = %d, want 1", got) + } + + // With both indexes built, the closure dispatches per-hit by + // scope. Replay the dispatch via the shared helper to pin the + // end-to-end shape. Topology is nil; managedBy is nil but + // issueCount dispatch is what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Node hit via builder: got %+v, want IssueCount=1 (was 0 pre-fix)", sc) + } + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 1 { + t.Errorf("Pod hit via builder: got %+v, want IssueCount=1", sc) + } +} + +// TestBuilderFromIndexes_DispatchesByScope pins the dual-index dispatch: +// cluster-scoped hits (Node, PV, …) read the cluster-wide index (where +// namespace="" issues live), namespaced hits (Pod, Deployment, …) read +// the namespace-scoped index. Without this dispatch, a search response +// that mixes Pods and Nodes silently zeros issueCount on the Node hits +// — the namespace-scoped index drops every namespace="" issue. +// +// A wiring inversion (cluster-scoped → namespaced index) would +// re-introduce the bug, so we additionally assert no cross-bucket leak. +func TestBuilderFromIndexes_DispatchesByScope(t *testing.T) { + // Build two distinct indexes so we can tell which one was consulted. + namespacedIdx := IssueIndex{} + namespacedIdx[issueIndexKey("", "Pod", "prod", "api-7")] = 4 + + clusterIdx := IssueIndex{} + clusterIdx[issueIndexKey("", "Node", "", "worker-1")] = 2 + + // Topology is nil — managedBy is nil but issueCount dispatch is + // what we're pinning here. + build := BuilderFromIndexes(nil, namespacedIdx, clusterIdx) + + // Cluster-scoped Node hit — must read clusterIdx. + if sc := build(nil, nil, "", "Node", "", "worker-1"); sc == nil || sc.IssueCount != 2 { + t.Errorf("Node hit: got %+v, want IssueCount=2 from clusterIdx", sc) + } + // Namespaced Pod hit — must read namespacedIdx. + if sc := build(nil, nil, "", "Pod", "prod", "api-7"); sc == nil || sc.IssueCount != 4 { + t.Errorf("Pod hit: got %+v, want IssueCount=4 from namespacedIdx", sc) + } + // A cluster-scoped hit whose name only lives in the namespaced + // index must return 0 (no cross-bucket leak). + if sc := build(nil, nil, "", "Node", "", "api-7"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Node hit using Pod-bucket name leaked count: %+v", sc) + } + // And a namespaced hit whose name only lives in the cluster index + // likewise returns 0. + if sc := build(nil, nil, "", "Pod", "prod", "worker-1"); sc != nil && sc.IssueCount != 0 { + t.Errorf("Pod hit using Node-bucket name leaked count: %+v", sc) + } +} + +// TestManagedByFromRelationships_PrefersManagedBy pins the topmost-manager +// shortcut: when topology has synthesized a ManagedBy chain (Pod → +// ReplicaSet → Deployment), the helper surfaces the Deployment, not the +// noisy hash-suffixed ReplicaSet that sits in Owner. +func TestManagedByFromRelationships_PrefersManagedBy(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Deployment", Namespace: "prod", Name: "api", Group: "apps"}, + }, + } + got := ManagedByFromRelationships(rel) + want := &resourcecontext.ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"} + if got == nil || got.Kind != want.Kind || got.Name != want.Name || got.Namespace != want.Namespace || got.Source != want.Source { + t.Errorf("got %#v, want %#v", got, want) + } +} + +// TestManagedByFromRelationships_FallsBackToOwner covers the case where +// topology synthesis declined ManagedBy (e.g. cluster-scoped roots) — +// we still surface the direct Owner so the row isn't context-less. +func TestManagedByFromRelationships_FallsBackToOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + } + got := ManagedByFromRelationships(rel) + if got == nil { + t.Fatalf("got nil, want Application ref") + } + if got.Source != "argocd" { + t.Errorf("Source = %q, want argocd", got.Source) + } +} + +// TestManagedByFromRelationships_ManagedByWinsOverOwner pins that when +// both ManagedBy and Owner are set, ManagedBy[0] takes precedence — the +// server-synthesized topmost-manager walk should never be shadowed by +// the direct owner ref left over for back-compat. +func TestManagedByFromRelationships_ManagedByWinsOverOwner(t *testing.T) { + rel := &topology.Relationships{ + Owner: &topology.ResourceRef{Kind: "ReplicaSet", Namespace: "prod", Name: "api-7d5", Group: "apps"}, + ManagedBy: []topology.ResourceRef{ + {Kind: "Application", Namespace: "argocd", Name: "storefront", Group: "argoproj.io"}, + }, + } + got := ManagedByFromRelationships(rel) + if got == nil || got.Kind != "Application" || got.Source != "argocd" { + t.Errorf("got %#v, want Application/argocd", got) + } +} + +func TestManagedByFromRelationships_NilSafe(t *testing.T) { + if got := ManagedByFromRelationships(nil); got != nil { + t.Errorf("nil rel: got %#v, want nil", got) + } + if got := ManagedByFromRelationships(&topology.Relationships{}); got != nil { + t.Errorf("empty rel: got %#v, want nil", got) + } +} diff --git a/pkg/ai/context/summary.go b/pkg/ai/context/summary.go index 47ece9d7f..fd27d4c6f 100644 --- a/pkg/ai/context/summary.go +++ b/pkg/ai/context/summary.go @@ -14,6 +14,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/skyhook-io/radar/pkg/resourcecontext" ) // ResourceSummary is the typed output for Summary-level minification. @@ -40,34 +42,42 @@ type ResourceSummary struct { Finalizers []string `json:"finalizers,omitempty"` // Type-specific fields (only populated when relevant) - Image string `json:"image,omitempty"` - Ports string `json:"ports,omitempty"` - Schedule string `json:"schedule,omitempty"` - Type string `json:"type,omitempty"` // Service type, Secret type - Selector string `json:"selector,omitempty"` - ClusterIP string `json:"clusterIP,omitempty"` - Hosts []string `json:"hosts,omitempty"` - Restarts int32 `json:"restarts,omitempty"` - Node string `json:"node,omitempty"` - Strategy string `json:"strategy,omitempty"` - Completions string `json:"completions,omitempty"` - Duration string `json:"duration,omitempty"` + Image string `json:"image,omitempty"` + Ports string `json:"ports,omitempty"` + Schedule string `json:"schedule,omitempty"` + Type string `json:"type,omitempty"` // Service type, Secret type + Selector string `json:"selector,omitempty"` + ClusterIP string `json:"clusterIP,omitempty"` + Hosts []string `json:"hosts,omitempty"` + Restarts int32 `json:"restarts,omitempty"` + Node string `json:"node,omitempty"` + Strategy string `json:"strategy,omitempty"` + Completions string `json:"completions,omitempty"` + Duration string `json:"duration,omitempty"` Suspended *bool `json:"suspended,omitempty"` Unschedulable *bool `json:"unschedulable,omitempty"` - Active int `json:"active,omitempty"` - Target string `json:"target,omitempty"` - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas int32 `json:"maxReplicas,omitempty"` - Current int32 `json:"current,omitempty"` - Desired int32 `json:"desired,omitempty"` - Roles []string `json:"roles,omitempty"` - Version string `json:"version,omitempty"` - Pressures []string `json:"pressures,omitempty"` - Keys []string `json:"keys,omitempty"` - StorageClass string `json:"storageClass,omitempty"` - Capacity string `json:"capacity,omitempty"` - AccessModes []string `json:"accessModes,omitempty"` - Owner string `json:"owner,omitempty"` + Active int `json:"active,omitempty"` + Target string `json:"target,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas int32 `json:"maxReplicas,omitempty"` + Current int32 `json:"current,omitempty"` + Desired int32 `json:"desired,omitempty"` + Roles []string `json:"roles,omitempty"` + Version string `json:"version,omitempty"` + Pressures []string `json:"pressures,omitempty"` + Keys []string `json:"keys,omitempty"` + StorageClass string `json:"storageClass,omitempty"` + Capacity string `json:"capacity,omitempty"` + AccessModes []string `json:"accessModes,omitempty"` + Owner string `json:"owner,omitempty"` + + // SummaryContext is the per-row enrichment attached by AI-facing list + // surfaces (REST /api/ai/resources/{kind}, MCP list_resources, search + // hits). Populated by handlers post-minify via resourcecontext.BuildSummary; + // nil when the caller opted out (?context=none) or when no fields apply. + // Type is resourcecontext.ResourceSummaryContext — the field name keeps + // the shorter "SummaryContext" form to match the wire JSON tag. + SummaryContext *resourcecontext.ResourceSummaryContext `json:"summaryContext,omitempty"` } // summarize dispatches to the appropriate per-type extractor and then diff --git a/pkg/resourcecontext/summary.go b/pkg/resourcecontext/summary.go new file mode 100644 index 000000000..15302bff0 --- /dev/null +++ b/pkg/resourcecontext/summary.go @@ -0,0 +1,214 @@ +package resourcecontext + +import ( + "strings" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// SummaryOptions configures the compact per-result enrichment produced by +// BuildSummary. All fields are pre-computed by the caller — this +// package never touches the issue engine, topology builder, or audit +// cache directly. Handlers in internal/* (REST list, MCP list_resources, +// search) walk the per-request topology + issue indexes once and pass +// the per-result digest in here. +type SummaryOptions struct { + // ManagedBy is the compact owner/GitOps pointer attached to the summary. + // Callers derive this from topology.Relationships via + // ManagedByFromOwner; nil leaves the field absent. + ManagedBy *ManagedByRef + + // IssueCount is the count of internal issue-engine findings scoped to + // the subject resource. Callers pre-compute a per-namespace index + // (e.g. via internal/issues.ComposeWithStats) once per request and + // pass the count in for each result. Zero omits the field. + IssueCount int + + // Health, when non-empty, overrides the derived health string. The + // default is computed from resource status via deriveHealth — Pod + // container readiness, replica-count workloads, and the standard + // Ready/Available condition on CRDs. Non-trivial kinds derive to "". + Health string +} + +// BuildSummary produces the compact per-result ResourceSummaryContext +// attached to list_resources, /api/ai/resources/{kind} list, and search +// hits. +// +// Tightly bounded — only the triage fields needed to choose a next hop. +// Returns nil when all three fields would be empty so callers can +// `omitempty` the entire object on bare results and keep the wire shape minimal. +func BuildSummary(obj runtime.Object, opts SummaryOptions) *ResourceSummaryContext { + health := opts.Health + if health == "" { + health = deriveHealth(obj) + } + if opts.ManagedBy == nil && health == "" && opts.IssueCount == 0 { + return nil + } + return &ResourceSummaryContext{ + ManagedBy: opts.ManagedBy, + Health: health, + IssueCount: opts.IssueCount, + } +} + +// ManagedByFromOwner assembles a compact ManagedByRef from raw owner +// fields (typically pulled out of topology.Relationships in the handler). +// Returns nil when ownerKind or ownerName is empty so callers don't +// have to guard the assignment. +// +// Source classification: +// - "argocd" for argoproj.io kinds (Application, ApplicationSet, Rollout) +// - "flux" for *.fluxcd.io kinds (Kustomization, HelmRelease, GitRepository, …) +// - "helm" for the native Helm release pseudo-owner (kind "HelmRelease" +// with no group — emitted by topology's detectManagedByFromMeta to +// distinguish from Flux's HelmRelease CR in helm.toolkit.fluxcd.io) +// - "native" for everything else (Deployment, StatefulSet, DaemonSet, ReplicaSet, Job, …) +func ManagedByFromOwner(ownerKind, ownerGroup, ownerNamespace, ownerName string) *ManagedByRef { + if ownerKind == "" || ownerName == "" { + return nil + } + return &ManagedByRef{ + Kind: ownerKind, + Source: sourceForOwner(ownerKind, ownerGroup), + Name: ownerName, + Namespace: ownerNamespace, + } +} + +func sourceForOwner(ownerKind, group string) string { + // Native Helm install: topology synthesizes a {Kind:"HelmRelease", Group:""} + // pseudo-owner from Helm's release-name/namespace annotations. This must + // be classified BEFORE the group-based GitOps branches so we don't fall + // through to "native" — Flux's HelmRelease lives at helm.toolkit.fluxcd.io + // and is handled by the *.fluxcd.io branch below. + if ownerKind == "HelmRelease" && group == "" { + return "helm" + } + switch group { + case "argoproj.io": + return "argocd" + } + if strings.HasSuffix(group, ".fluxcd.io") { + return "flux" + } + return "native" +} + +// deriveHealth applies a tiny per-kind heuristic to classify a resource +// as "healthy" | "degraded" | "unhealthy". Kinds we don't recognize +// derive to "" and the field is omitted on the wire. +// +// Vocabulary matches the broader status-tone scheme used across the UI +// (k8s-ui StatusTone) so consumers don't need to translate. +func deriveHealth(obj runtime.Object) string { + if obj == nil { + return "" + } + switch o := obj.(type) { + case *corev1.Pod: + return podHealth(o) + case *appsv1.Deployment: + // Use Spec.Replicas (desired) not Status.Replicas (current). During + // scale-down or rolling updates, Status.Replicas can exceed + // Spec.Replicas while terminating pods drain; comparing ReadyReplicas + // against Status.Replicas would falsely report "degraded" when all + // desired replicas are actually ready. Matches StatefulSet semantics. + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *appsv1.StatefulSet: + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *appsv1.DaemonSet: + return replicasHealth(o.Status.NumberReady, o.Status.DesiredNumberScheduled) + case *appsv1.ReplicaSet: + // Same Spec-vs-Status concern as Deployment above. + desired := int32(1) + if o.Spec.Replicas != nil { + desired = *o.Spec.Replicas + } + return replicasHealth(o.Status.ReadyReplicas, desired) + case *unstructured.Unstructured: + return unstructuredHealth(o) + } + return "" +} + +func podHealth(p *corev1.Pod) string { + switch p.Status.Phase { + case corev1.PodRunning: + if len(p.Status.ContainerStatuses) == 0 { + return "degraded" + } + for _, cs := range p.Status.ContainerStatuses { + if !cs.Ready { + return "degraded" + } + } + return "healthy" + case corev1.PodSucceeded: + return "healthy" + case corev1.PodFailed: + return "unhealthy" + case corev1.PodPending: + return "degraded" + } + return "" +} + +func replicasHealth(ready, desired int32) string { + if desired <= 0 { + return "" + } + if ready >= desired { + return "healthy" + } + if ready <= 0 { + return "unhealthy" + } + return "degraded" +} + +// unstructuredHealth derives health for CRDs that follow the standard +// Ready/Available condition pattern. Returns "" for kinds without a +// matching condition so we don't emit a misleading status for resources +// whose status shape we don't understand. +func unstructuredHealth(u *unstructured.Unstructured) string { + if u == nil { + return "" + } + conditions, found, _ := unstructured.NestedSlice(u.Object, "status", "conditions") + if !found || len(conditions) == 0 { + return "" + } + for _, c := range conditions { + cond, ok := c.(map[string]any) + if !ok { + continue + } + condType, _ := cond["type"].(string) + if condType != "Ready" && condType != "Available" { + continue + } + status, _ := cond["status"].(string) + switch status { + case "True": + return "healthy" + case "False": + return "unhealthy" + default: + return "degraded" + } + } + return "" +} diff --git a/pkg/resourcecontext/summary_test.go b/pkg/resourcecontext/summary_test.go new file mode 100644 index 000000000..96c4979be --- /dev/null +++ b/pkg/resourcecontext/summary_test.go @@ -0,0 +1,391 @@ +package resourcecontext + +import ( + "encoding/json" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +// TestBuildSummary_NilWhenEmpty pins that BuildSummary returns nil when +// every field would be empty — keeps the per-row JSON minimal. +func TestBuildSummary_NilWhenEmpty(t *testing.T) { + // ConfigMap has no health heuristic and no caller-supplied options. + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "x", Namespace: "y"}} + if got := BuildSummary(cm, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(ConfigMap, {}) = %#v, want nil", got) + } +} + +// TestBuildSummary_PodGoldens golden-files BuildSummary across the +// Pod phases that drive the health heuristic. Locks the wire shape +// for the common "list pods" call. +func TestBuildSummary_PodGoldens(t *testing.T) { + cases := []struct { + name string + pod *corev1.Pod + opts SummaryOptions + want string + }{ + { + name: "running_all_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + {Name: "sidecar", Ready: true}, + }, + }, + }, + want: `{"health":"healthy"}`, + }, + { + name: "running_one_not_ready", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: false}, + }, + }, + }, + want: `{"health":"degraded"}`, + }, + { + name: "failed", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "pending", + pod: &corev1.Pod{ + Status: corev1.PodStatus{Phase: corev1.PodPending}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "running_with_issues_and_managedby", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true}, + }, + }, + }, + opts: SummaryOptions{ + ManagedBy: ManagedByFromOwner("ReplicaSet", "apps", "prod", "api-7d5"), + IssueCount: 2, + }, + want: `{"managedBy":{"kind":"ReplicaSet","source":"native","name":"api-7d5","namespace":"prod"},"health":"healthy","issueCount":2}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := BuildSummary(c.pod, c.opts) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, err := json.Marshal(got) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_DeploymentReplicasHealth covers the replica-driven +// health heuristic across the Deployment cases. +func TestBuildSummary_DeploymentReplicasHealth(t *testing.T) { + cases := []struct { + name string + ready int32 + desired int32 + wantSlice []byte // JSON of BuildSummary output + }{ + {"all_ready", 3, 3, []byte(`{"health":"healthy"}`)}, + {"none_ready", 0, 3, []byte(`{"health":"unhealthy"}`)}, + {"partial", 1, 3, []byte(`{"health":"degraded"}`)}, + {"scaled_to_zero", 0, 0, nil}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + desired := c.desired + dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &desired, // desired is Spec.Replicas (not Status) — see deriveHealth + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: c.ready, + // Status.Replicas mirrors the actual non-terminated pod count + // in real clusters; we set it equal to ready here so the + // fixture matches a steady-state Deployment for that test. + Replicas: c.ready, + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if c.wantSlice == nil { + if got != nil { + t.Fatalf("got %#v, want nil", got) + } + return + } + if got == nil { + t.Fatalf("got nil, want %s", c.wantSlice) + } + b, _ := json.Marshal(got) + if string(b) != string(c.wantSlice) { + t.Errorf("got %s\nwant %s", b, c.wantSlice) + } + }) + } +} + +// TestBuildSummary_DeploymentHealthDuringScaleDown pins the Spec-vs-Status +// regression flagged on PR #722: during rolling updates or scale-down, +// Status.Replicas (current pod count) can exceed Spec.Replicas (desired). +// Before the fix, deriveHealth compared ReadyReplicas against Status.Replicas +// and reported "degraded" because not all current pods were ready — even +// though all DESIRED replicas were ready and the cluster was healthily +// draining excess pods. Use Spec.Replicas as the denominator instead. +func TestBuildSummary_DeploymentHealthDuringScaleDown(t *testing.T) { + desired := int32(2) + dep := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{Replicas: &desired}, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 2, // all DESIRED replicas are ready + Replicas: 4, // but 2 extras still terminating from a scale-down + }, + } + got := BuildSummary(dep, SummaryOptions{}) + if got == nil { + t.Fatal("got nil, want ResourceSummaryContext with health=healthy") + } + if got.Health != "healthy" { + t.Errorf("Health = %q, want %q (Spec.Replicas=2 ready, Status.Replicas=4 due to draining)", got.Health, "healthy") + } +} + +// TestBuildSummary_ReplicaSetHealthDuringScaleDown pins the same fix for +// ReplicaSet — the Deployment regression also applied here. +func TestBuildSummary_ReplicaSetHealthDuringScaleDown(t *testing.T) { + desired := int32(3) + rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{Replicas: &desired}, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 3, + Replicas: 5, + }, + } + got := BuildSummary(rs, SummaryOptions{}) + if got == nil || got.Health != "healthy" { + t.Errorf("ReplicaSet during scale-down: got %+v, want Health=healthy", got) + } +} + +// TestBuildSummary_NetworkPolicy verifies BuildSummary handles a kind +// without a health heuristic — it should only emit fields the caller +// supplied (e.g. issueCount, managedBy) and skip health entirely. +func TestBuildSummary_NetworkPolicy(t *testing.T) { + np := &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "deny-all", Namespace: "prod"}, + } + // Empty opts → nil; the kind has no health heuristic so no field is set. + if got := BuildSummary(np, SummaryOptions{}); got != nil { + t.Fatalf("got %#v, want nil", got) + } + // IssueCount only → summary with just issueCount. + got := BuildSummary(np, SummaryOptions{IssueCount: 3}) + if got == nil { + t.Fatalf("got nil, want summary with issueCount") + } + b, _ := json.Marshal(got) + want := `{"issueCount":3}` + if string(b) != want { + t.Errorf("got %s\nwant %s", b, want) + } +} + +// TestBuildSummary_UnstructuredReadyCondition covers the CRD fallback +// — Ready/Available conditions are translated to the health vocabulary. +func TestBuildSummary_UnstructuredReadyCondition(t *testing.T) { + cases := []struct { + name string + conditions []any + want string + }{ + { + name: "ready_true", + conditions: []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + { + name: "ready_false", + conditions: []any{ + map[string]any{"type": "Ready", "status": "False"}, + }, + want: `{"health":"unhealthy"}`, + }, + { + name: "ready_unknown", + conditions: []any{ + map[string]any{"type": "Ready", "status": "Unknown"}, + }, + want: `{"health":"degraded"}`, + }, + { + name: "available_true", + conditions: []any{ + map[string]any{"type": "Available", "status": "True"}, + }, + want: `{"health":"healthy"}`, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + u := &unstructured.Unstructured{Object: map[string]any{ + "apiVersion": "example.io/v1", + "kind": "Widget", + "status": map[string]any{"conditions": c.conditions}, + }} + got := BuildSummary(u, SummaryOptions{}) + if got == nil { + t.Fatalf("got nil, want %s", c.want) + } + b, _ := json.Marshal(got) + if string(b) != c.want { + t.Errorf("got %s\nwant %s", b, c.want) + } + }) + } +} + +// TestBuildSummary_HealthOverride pins that caller-supplied Health +// short-circuits the per-kind heuristic. +func TestBuildSummary_HealthOverride(t *testing.T) { + dep := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ReadyReplicas: 3, Replicas: 3}, + } + got := BuildSummary(dep, SummaryOptions{Health: "degraded"}) + if got == nil || got.Health != "degraded" { + t.Fatalf("Health override ignored: %#v", got) + } +} + +// TestManagedByFromOwner pins source classification for each cluster +// of owner kinds we care about. +func TestManagedByFromOwner(t *testing.T) { + cases := []struct { + name string + kind string + group string + namespace string + ownerName string + want *ManagedByRef + }{ + { + name: "empty_kind", + kind: "", + ownerName: "x", + want: nil, + }, + { + name: "empty_name", + kind: "Deployment", + ownerName: "", + want: nil, + }, + { + name: "deployment", + kind: "Deployment", + group: "apps", + namespace: "prod", + ownerName: "api", + want: &ManagedByRef{Kind: "Deployment", Source: "native", Name: "api", Namespace: "prod"}, + }, + { + name: "argocd_application", + kind: "Application", + group: "argoproj.io", + namespace: "argocd", + ownerName: "storefront", + want: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, + }, + { + name: "flux_kustomization", + kind: "Kustomization", + group: "kustomize.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_helmrelease", + kind: "HelmRelease", + group: "helm.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "prod-apps", + want: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}, + }, + { + name: "flux_gitrepository", + kind: "GitRepository", + group: "source.toolkit.fluxcd.io", + namespace: "flux-system", + ownerName: "repo", + want: &ManagedByRef{Kind: "GitRepository", Source: "flux", Name: "repo", Namespace: "flux-system"}, + }, + { + // Native Helm release: topology's detectManagedByFromMeta emits + // {Kind:"HelmRelease", Group:""} when it sees Helm's release-name + // annotation (no Flux/GitOps signal). Must classify as "helm", + // not "native" — distinguishes Helm-managed resources in the + // list/search UI from raw kubectl-applied ones. The Flux + // HelmRelease CR lives at helm.toolkit.fluxcd.io and is covered + // by the case above; the empty-group form is unambiguous. + name: "native_helm_release", + kind: "HelmRelease", + group: "", + namespace: "cert-manager", + ownerName: "cert-manager", + want: &ManagedByRef{Kind: "HelmRelease", Source: "helm", Name: "cert-manager", Namespace: "cert-manager"}, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := ManagedByFromOwner(c.kind, c.group, c.namespace, c.ownerName) + if !reflect.DeepEqual(got, c.want) { + t.Errorf("ManagedByFromOwner(%q, %q, %q, %q) = %#v, want %#v", + c.kind, c.group, c.namespace, c.ownerName, got, c.want) + } + }) + } +} + +// TestBuildSummary_NilObject defends against the typed-nil-in-interface +// trap: handlers occasionally pass interface-wrapped nils. +func TestBuildSummary_NilObject(t *testing.T) { + var obj runtime.Object + if got := BuildSummary(obj, SummaryOptions{}); got != nil { + t.Fatalf("BuildSummary(nil) = %#v, want nil", got) + } + // IssueCount alone still produces output (no panic via nil obj). + got := BuildSummary(obj, SummaryOptions{IssueCount: 1}) + if got == nil || got.IssueCount != 1 { + t.Fatalf("BuildSummary(nil, IssueCount=1) = %#v, want issueCount=1", got) + } +} diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 6e98d0bec..235ed2749 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -64,20 +64,23 @@ type ContextRef struct { } // ManagedByRef is the compact form of a "managed-by" pointer used in -// SummaryContext (list/search rows). Carries Kind alongside Source so +// ResourceSummaryContext (list/search rows). Carries Kind alongside Source so // consumers can distinguish e.g. a Flux Kustomization from a Flux // HelmRelease without re-parsing the Source string. Intentionally lacks // Group to keep per-row bytes minimal. type ManagedByRef struct { - Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … - Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" + Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … + Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" Name string `json:"name"` Namespace string `json:"namespace,omitempty"` } -// SummaryContext is the per-row enrichment attached to list_resources -// and search hits. Always-on, intentionally minimal (≤ ~60 bytes). -type SummaryContext struct { +// ResourceSummaryContext is the per-row enrichment attached to +// list_resources and search hits. The row-tier companion to +// ResourceContext (the detail-tier enrichment on GET responses) — +// optimised for bulk triage on lists at ≤ ~60 bytes per row. Always-on +// when the caller didn't opt out via context=none. +type ResourceSummaryContext struct { ManagedBy *ManagedByRef `json:"managedBy,omitempty"` Health string `json:"health,omitempty"` IssueCount int `json:"issueCount,omitempty"` diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index f8174fe3e..ccb22f389 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -214,10 +214,10 @@ func TestResourceContextRoundTrip(t *testing.T) { } } -// TestSummaryContextRoundTrip covers SummaryContext + ManagedByRef +// TestResourceSummaryContextRoundTrip covers ResourceSummaryContext + ManagedByRef // which are not embedded in ResourceContext. -func TestSummaryContextRoundTrip(t *testing.T) { - orig := SummaryContext{ +func TestResourceSummaryContextRoundTrip(t *testing.T) { + orig := ResourceSummaryContext{ ManagedBy: &ManagedByRef{Kind: "Application", Source: "argocd", Name: "storefront", Namespace: "argocd"}, Health: "degraded", IssueCount: 2, @@ -226,7 +226,7 @@ func TestSummaryContextRoundTrip(t *testing.T) { if err != nil { t.Fatalf("marshal: %v", err) } - var got SummaryContext + var got ResourceSummaryContext if err := json.Unmarshal(b, &got); err != nil { t.Fatalf("unmarshal: %v", err) } @@ -241,12 +241,12 @@ func TestSummaryContextRoundTrip(t *testing.T) { s := string(b) for _, sub := range wantSubstr { if !strings.Contains(s, sub) { - t.Errorf("SummaryContext JSON missing %s: %s", sub, s) + t.Errorf("ResourceSummaryContext JSON missing %s: %s", sub, s) } } for _, forbidden := range []string{`"group"`} { if strings.Contains(s, forbidden) { - t.Errorf("SummaryContext JSON leaks %s: %s", forbidden, s) + t.Errorf("ResourceSummaryContext JSON leaks %s: %s", forbidden, s) } } } @@ -255,8 +255,8 @@ func TestSummaryContextRoundTrip(t *testing.T) { // without it, Flux Kustomization vs HelmRelease serialize to identical // JSON, forcing consumers to parse the Source string. func TestManagedByRefDistinguishesFluxKinds(t *testing.T) { - kustomization := SummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} - helmRelease := SummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + kustomization := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "Kustomization", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} + helmRelease := ResourceSummaryContext{ManagedBy: &ManagedByRef{Kind: "HelmRelease", Source: "flux", Name: "prod-apps", Namespace: "flux-system"}} kJSON, _ := json.Marshal(kustomization) hJSON, _ := json.Marshal(helmRelease)