Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
ef2daf9
feat(resourcecontext): attach summaryContext to list_resources + sear…
nadaverell May 17, 2026
32d72d4
feat(resourcecontext): consolidate summary builder on T23 (ManagedBy …
nadaverell May 17, 2026
d695115
fix(summaryContext): group-aware issue index, uncapped count, native …
nadaverell May 17, 2026
1685c30
fix(security+correctness): preflight RBAC on AI list + populate Probl…
nadaverell May 17, 2026
ca7a044
perf(summaryContext): memoize MCP topology builds + fix misleading fi…
nadaverell May 17, 2026
7497611
fix(summaryContext): cluster-scoped issueCount + CRD scan perf
nadaverell May 17, 2026
4e859bc
fix(summaryContext+ai-list): dual-index search + group-aware list rou…
nadaverell May 17, 2026
b728824
fix(summaryContext): use Spec.Replicas + defer SummaryBuilder past tr…
nadaverell May 18, 2026
fc47ec2
chore(search): drop dead summaryBuilder parameter from buildHit
nadaverell May 18, 2026
fa66d3e
fix(resourcecontext): restore SummaryContext type after stray rename
nadaverell May 18, 2026
1d263ed
refactor(summaryContext): lift shared core into internal/summarycontext
nadaverell May 18, 2026
e6c1df0
refactor(resourcecontext): rename SummaryContext → ResourceSummaryCon…
nadaverell May 18, 2026
a75e2f9
refactor(summarycontext): centralize attach helpers + group extractors
nadaverell May 18, 2026
441fd2c
fix(summarycontext): drop kindFilter so CRD plurals don't zero issueC…
nadaverell May 18, 2026
9878009
test: extend fakeIssuesProvider with Kyverno methods for post-T11 Pro…
nadaverell May 18, 2026
8ef09d8
fix: rebase fallout — restore dropped runtime import in ai_handlers.go
nadaverell May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions internal/issues/issues.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,17 @@ func Compose(p Provider, f Filters) []Issue {
// severity desc, then last-seen desc, then kind/ns/name for stable
// tiebreaks.
func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) {
// Negative Limit is the "uncapped" sentinel: callers that need the
// full matched set (per-resource issue indexes for /api/ai list +
// search summaryContext) pass NoLimit so a 5000-issue cluster
// doesn't silently drop counts for resources whose issues fall in
// the tail beyond MaxLimit. Zero still maps to DefaultLimit so the
// public /api/issues + MCP issues_list keep their tight caps.
uncapped := f.Limit < 0
if f.Limit == 0 {
f.Limit = DefaultLimit
}
if f.Limit > MaxLimit {
if !uncapped && f.Limit > MaxLimit {
f.Limit = MaxLimit
}

Expand Down Expand Up @@ -201,7 +208,7 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) {
return out[i].Name < out[j].Name
})
stats.TotalMatched = len(out)
if len(out) > f.Limit {
if !uncapped && len(out) > f.Limit {
out = out[:f.Limit]
}
return out, stats
Expand All @@ -211,11 +218,24 @@ func ComposeWithStats(p Provider, f Filters) ([]Issue, ComposeStats) {
// warning Issue for each object that has a False Ready/Available/etc.
// condition. Skips kinds owned by curated checkers (Cluster API today)
// to avoid double-reporting.
//
// When f.Kinds is non-empty (e.g. summaryContext building a per-resource
// issue index for a list_resources call on a single kind), GVRs whose
// kind isn't in the filter are skipped BEFORE the ListDynamic call —
// without this gate, a pods-only request still scanned every watched
// CRD up front and applyFilters discarded the rows afterward. Kind
// comparison mirrors applyFilters: lowercase for case-insensitive
// match against the user's filter (which itself is canonicalized to
// the singular form upstream).
func detectGenericCRDIssues(p Provider, f Filters) []Issue {
gvrs := p.WatchedDynamic()
if len(gvrs) == 0 {
return nil
}
wantKind := map[string]bool{}
for _, k := range f.Kinds {
wantKind[strings.ToLower(k)] = true
}
var out []Issue
for _, gvr := range gvrs {
if isCuratedCRDGroup(gvr.Group) {
Expand All @@ -225,6 +245,15 @@ func detectGenericCRDIssues(p Provider, f Filters) []Issue {
if kind == "" {
continue
}
// applyFilters runs after Compose returns — but on hot paths that
// pin a single kind (summaryContext per-row index), routing the
// kind filter through here skips the per-GVR ListDynamic call
// entirely. Match in lowercase (same as applyFilters) so
// "Pod"/"pod" and CRD-typed "MyResource"/"myresource" both
// compare equal.
if len(wantKind) > 0 && !wantKind[strings.ToLower(kind)] {
continue
}
clusterScoped, _, _ := classifyDynamicScope(p, gvr, kind)
if clusterScoped && f.CanReadClusterScoped != nil && !f.CanReadClusterScoped(kind, gvr.Group) {
continue
Expand Down
84 changes: 84 additions & 0 deletions internal/issues/issues_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,3 +622,87 @@ func TestFlattenNamespacedProblems_EmptyInputReturnsNil(t *testing.T) {
t.Errorf("empty input should produce empty output, got %+v", out)
}
}

// countingProvider wraps fakeProvider and tallies ListDynamic calls per
// GVR. Used by TestDetectGenericCRDIssues_SkipsListWhenKindFiltered to
// pin that detectGenericCRDIssues short-circuits the per-GVR
// ListDynamic call when f.Kinds excludes the GVR's kind — on clusters
// with hundreds of watched CRDs, scanning every one for a pods-only
// summaryContext request was the dominant cost.
type countingProvider struct {
	// fakeProvider is embedded so every Provider method other than
	// ListDynamic is promoted unchanged from the base fake.
	fakeProvider
	// listCalls tallies ListDynamic invocations keyed by GVR. It is
	// allocated lazily on the first ListDynamic call, so a nil map means
	// "nothing recorded yet" and tests may reset it by assigning nil.
	listCalls map[schema.GroupVersionResource]int
}

// ListDynamic records one call against gvr and then delegates to the
// embedded fakeProvider, returning its result untouched. The counter map
// is created on first use so a zero-value countingProvider is usable.
func (c *countingProvider) ListDynamic(gvr schema.GroupVersionResource, ns string) ([]*unstructured.Unstructured, error) {
	tally := c.listCalls
	if tally == nil {
		tally = make(map[schema.GroupVersionResource]int)
		c.listCalls = tally
	}
	tally[gvr]++
	return c.fakeProvider.ListDynamic(gvr, ns)
}

// TestDetectGenericCRDIssues_SkipsListWhenKindFiltered guards the
// kind-aware short-circuit in detectGenericCRDIssues.
//
// It registers three watched GVRs (Pod, Application, NodePool) on a
// countingProvider and checks two things:
//
//  1. With Filters.Kinds = ["Application"], ListDynamic is invoked for
//     the Application GVR only; the other two GVRs are never listed.
//  2. With an empty Filters, all three GVRs are listed, so the skip is
//     driven by the filter rather than being unconditional.
//
// Only the ListDynamic call counts are asserted; the issues returned by
// detectGenericCRDIssues are deliberately discarded.
func TestDetectGenericCRDIssues_SkipsListWhenKindFiltered(t *testing.T) {
	var (
		podGVR = schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"}
		appGVR = schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "applications"}
		npGVR  = schema.GroupVersionResource{Group: "karpenter.sh", Version: "v1", Resource: "nodepools"}
	)

	// The single Application carries a False condition so the matching
	// GVR has a real object to return; Pod and NodePool stay empty
	// because only the call count matters for them.
	driftedApp := &unstructured.Unstructured{Object: map[string]any{
		"metadata": map[string]any{"name": "a", "namespace": "argocd"},
		"status": map[string]any{
			"conditions": []any{
				map[string]any{"type": "Synced", "status": "False", "reason": "Drift"},
			},
		},
	}}

	p := &countingProvider{
		fakeProvider: fakeProvider{
			dynamic: map[schema.GroupVersionResource][]*unstructured.Unstructured{
				podGVR: {},
				appGVR: {driftedApp},
				npGVR:  {},
			},
			kinds: map[schema.GroupVersionResource]string{
				podGVR: "Pod",
				appGVR: "Application",
				npGVR:  "NodePool",
			},
		},
	}

	// Phase 1: pin the filter to Application. The kind comparison inside
	// detectGenericCRDIssues is lowercase on both sides, so the canonical
	// "Application" spelling matches the GVR's kind.
	onlyApplications := Filters{Kinds: []string{"Application"}}
	_ = detectGenericCRDIssues(p, onlyApplications)

	if n := p.listCalls[podGVR]; n != 0 {
		t.Errorf("Pod GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", n)
	}
	if n := p.listCalls[npGVR]; n != 0 {
		t.Errorf("NodePool GVR ListDynamic calls = %d, want 0 (kind filter must skip non-matching GVRs)", n)
	}
	if n := p.listCalls[appGVR]; n == 0 {
		t.Errorf("Application GVR ListDynamic calls = %d, want >= 1 (matching kind must still be scanned)", n)
	}

	// Phase 2: reset the tally and run without a kind filter; every
	// watched GVR must be listed at least once.
	p.listCalls = nil
	_ = detectGenericCRDIssues(p, Filters{})
	for _, gvr := range []schema.GroupVersionResource{podGVR, appGVR, npGVR} {
		called := p.listCalls[gvr] > 0
		if !called {
			t.Errorf("no kind filter: GVR %s called=%v, want %v", gvr.Resource, called, true)
		}
	}
}
7 changes: 7 additions & 0 deletions internal/issues/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,11 @@ type Filters struct {
const (
	// DefaultLimit is the cap ComposeWithStats substitutes when
	// Filters.Limit is left at zero.
	DefaultLimit = 200
	// MaxLimit is the ceiling ComposeWithStats clamps a requested limit
	// to when the caller asks for more and has not opted out of capping
	// with a negative Limit.
	MaxLimit = 1000
	// NoLimit disables the result cap. Pass as Filters.Limit when the
	// caller needs the full matched set (e.g. building a per-resource
	// issue index for summaryContext — capping there would silently zero
	// out counts for resources whose issues fall in the tail beyond
	// MaxLimit on large clusters). Stats.TotalMatched is reliable
	// regardless; this just turns off the post-sort slice.
	//
	// ComposeWithStats treats any negative Limit as uncapped; NoLimit is
	// the named spelling callers should use.
	NoLimit = -1
)
7 changes: 7 additions & 0 deletions internal/k8s/problems.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "Deployment",
Namespace: d.Namespace,
Name: d.Name,
Group: "apps",
Severity: "critical",
Reason: fmt.Sprintf("%d/%d available", d.Status.AvailableReplicas, d.Status.Replicas),
Age: FormatAge(ageDur),
Expand All @@ -78,6 +79,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "Deployment",
Namespace: d.Namespace,
Name: d.Name,
Group: "apps",
Severity: "critical",
Reason: "Rollout stuck",
Message: cond.Message,
Expand Down Expand Up @@ -107,6 +109,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "StatefulSet",
Namespace: ss.Namespace,
Name: ss.Name,
Group: "apps",
Severity: "critical",
Reason: fmt.Sprintf("%d/%d ready", ss.Status.ReadyReplicas, ss.Status.Replicas),
Age: FormatAge(ageDur),
Expand All @@ -133,6 +136,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "DaemonSet",
Namespace: ds.Namespace,
Name: ds.Name,
Group: "apps",
Severity: "critical",
Reason: fmt.Sprintf("%d unavailable", ds.Status.NumberUnavailable),
Age: FormatAge(ageDur),
Expand All @@ -157,6 +161,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "HorizontalPodAutoscaler",
Namespace: hp.Namespace,
Name: hp.Name,
Group: "autoscaling",
Severity: "medium",
Reason: hp.Problem,
Message: hp.Reason,
Expand All @@ -177,6 +182,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "CronJob",
Namespace: cp.Namespace,
Name: cp.Name,
Group: "batch",
Severity: "medium",
Reason: cp.Problem,
Message: cp.Reason,
Expand Down Expand Up @@ -251,6 +257,7 @@ func DetectProblems(cache *ResourceCache, namespace string) []Problem {
Kind: "Job",
Namespace: job.Namespace,
Name: job.Name,
Group: "batch",
Severity: "high",
Reason: fmt.Sprintf("Running for %s with no completions", FormatAge(ageDur)),
Age: FormatAge(ageDur),
Expand Down
144 changes: 144 additions & 0 deletions internal/k8s/problems_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package k8s

import (
"testing"
"time"

appsv1 "k8s.io/api/apps/v1"
autoscalingv2 "k8s.io/api/autoscaling/v2"
batchv1 "k8s.io/api/batch/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
)

// TestDetectProblems_PopulatesGroup pins that the built-in Problems
// emitted by DetectProblems carry the correct canonical API group.
//
// The summary_context issue index keys per-resource counts as
// "group|kind|ns|name" — a Problem with an empty Group collides with
// no real bucket, silently zeroing issueCount for that workload row.
// Pre-fix, all the built-in append-Problem sites omitted the field, so
// every broken Deployment/StatefulSet/DaemonSet/HPA/CronJob/Job
// reported issueCount: 0 in the AI list envelope — a regression
// against the pre-group-aware behavior.
//
// The test constructs one broken object for each of Deployment,
// StatefulSet, DaemonSet, HorizontalPodAutoscaler and Job, drives
// DetectProblems against a fake client, and asserts that EVERY emitted
// Problem of those kinds has the canonical group for its kind. It also
// fails if any of the five kinds produced no Problem at all, since that
// means the fixture no longer triggers the detector.
//
// NOTE: CronJob has no fixture here, so its Group is not covered by
// this test.
func TestDetectProblems_PopulatesGroup(t *testing.T) {
	defer ResetTestState()

	oneReplica := int32(1)
	minReplicas := int32(1)
	now := time.Now()
	// Job needs to be older than 1h to surface a "stuck" problem.
	jobStart := metav1.NewTime(now.Add(-2 * time.Hour))

	client := fake.NewClientset(
		// Deployment with unavailable replicas — triggers the
		// "X/Y available" Problem branch.
		&appsv1.Deployment{
			ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"},
			Spec:       appsv1.DeploymentSpec{Replicas: &oneReplica},
			Status: appsv1.DeploymentStatus{
				Replicas:            1,
				UnavailableReplicas: 1,
			},
		},
		// StatefulSet with readyReplicas < replicas.
		&appsv1.StatefulSet{
			ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "prod"},
			Spec:       appsv1.StatefulSetSpec{Replicas: &oneReplica},
			Status: appsv1.StatefulSetStatus{
				Replicas:      1,
				ReadyReplicas: 0,
			},
		},
		// DaemonSet with numberUnavailable > 0.
		&appsv1.DaemonSet{
			ObjectMeta: metav1.ObjectMeta{Name: "logger", Namespace: "prod"},
			Status: appsv1.DaemonSetStatus{
				NumberUnavailable: 2,
			},
		},
		// HPA at its replica ceiling — DetectHPAProblems flags
		// "maxed" when current and desired both hit MaxReplicas.
		// The wrapper sets Group="autoscaling".
		&autoscalingv2.HorizontalPodAutoscaler{
			ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"},
			Spec: autoscalingv2.HorizontalPodAutoscalerSpec{
				MinReplicas: &minReplicas,
				MaxReplicas: 10,
			},
			Status: autoscalingv2.HorizontalPodAutoscalerStatus{
				CurrentReplicas: 10,
				DesiredReplicas: 10,
			},
		},
		// Job stuck Active>0 for >1h with no completions.
		&batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{Name: "migrate", Namespace: "prod", CreationTimestamp: jobStart},
			Status: batchv1.JobStatus{
				Active:    1,
				Succeeded: 0,
				Failed:    0,
			},
		},
	)

	if err := InitTestResourceCache(client); err != nil {
		t.Fatalf("InitTestResourceCache: %v", err)
	}
	cache := GetResourceCache()
	if cache == nil {
		t.Fatal("cache nil after init")
	}

	// Allow informers a brief moment to populate. The fake clientset
	// pre-seeds the store, but the lister types reconstruct via
	// informer events on a separate goroutine.
	deadline := time.Now().Add(2 * time.Second)
	for time.Now().Before(deadline) {
		if hasAllProblemTypes(DetectProblems(cache, "prod")) {
			break
		}
		time.Sleep(20 * time.Millisecond)
	}

	problems := DetectProblems(cache, "prod")

	wantGroup := map[string]string{
		"Deployment":              "apps",
		"StatefulSet":             "apps",
		"DaemonSet":               "apps",
		"HorizontalPodAutoscaler": "autoscaling",
		"Job":                     "batch",
	}

	// Validate each Problem individually. Collapsing to one entry per
	// kind (last write wins) would let a later, correctly-grouped Problem
	// hide an earlier one of the same kind whose Group is empty or wrong
	// (e.g. two different Deployment detectors firing on one object).
	emitted := make(map[string]bool, len(wantGroup))
	for _, p := range problems {
		want, tracked := wantGroup[p.Kind]
		if !tracked {
			// Kinds outside the table are not this test's concern.
			continue
		}
		emitted[p.Kind] = true
		if p.Group != want {
			t.Errorf("%s.Group = %q, want %q (summary_context index keys by group — empty Group zeros issueCount)", p.Kind, p.Group, want)
		}
	}

	// A tracked kind with no Problem at all means the fixture above no
	// longer trips the detector, so the Group assertion never ran.
	for kind := range wantGroup {
		if !emitted[kind] {
			t.Errorf("no Problem emitted for %s — fixture wiring broken; got %d problems: %+v", kind, len(problems), problems)
		}
	}
}

// hasAllProblemTypes reports whether problems contains at least one
// entry for each of the five kinds the Group test waits on: Deployment,
// StatefulSet, DaemonSet, HorizontalPodAutoscaler and Job. Entries of any
// other kind are ignored, and duplicates are harmless.
func hasAllProblemTypes(problems []Problem) bool {
	present := make(map[string]struct{}, len(problems))
	for i := range problems {
		present[problems[i].Kind] = struct{}{}
	}
	for _, kind := range []string{
		"Deployment",
		"StatefulSet",
		"DaemonSet",
		"HorizontalPodAutoscaler",
		"Job",
	} {
		if _, ok := present[kind]; !ok {
			return false
		}
	}
	return true
}
Loading
Loading