diff --git a/internal/k8s/detect_test.go b/internal/k8s/detect_test.go index 01e88ed70..f497e1c30 100644 --- a/internal/k8s/detect_test.go +++ b/internal/k8s/detect_test.go @@ -66,8 +66,8 @@ func TestDetectProblems_PopulatesGroup(t *testing.T) { NumberUnavailable: 2, }, }, - // HPA at its replica ceiling — DetectHPAProblems flags - // "maxed" when current and desired both hit MaxReplicas. + // HPA capped by maxReplicas — DetectHPAProblems flags + // "maxed" when the controller reports TooManyReplicas. // The wrapper sets Group="autoscaling". &autoscalingv2.HorizontalPodAutoscaler{ ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"}, @@ -78,6 +78,9 @@ func TestDetectProblems_PopulatesGroup(t *testing.T) { Status: autoscalingv2.HorizontalPodAutoscalerStatus{ CurrentReplicas: 10, DesiredReplicas: 10, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"}, + }, }, }, // Job stuck Active>0 for >1h with no completions. diff --git a/internal/k8s/detect_workload.go b/internal/k8s/detect_workload.go index 7309ee591..6b093a546 100644 --- a/internal/k8s/detect_workload.go +++ b/internal/k8s/detect_workload.go @@ -7,14 +7,15 @@ import ( autoscalingv2 "k8s.io/api/autoscaling/v2" batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" + + "github.com/skyhook-io/radar/pkg/hpadiag" ) // HPAProblem describes a detected issue with an HPA. type HPAProblem struct { Name string Namespace string - Problem string // "maxed" + Problem string // "maxed" or "cannot-scale" Reason string } @@ -29,42 +30,67 @@ type HPAProblem struct { func DetectHPAProblems(hpas []*autoscalingv2.HorizontalPodAutoscaler) []HPAProblem { var problems []HPAProblem for _, hpa := range hpas { - // "maxed" — at replica ceiling and wanting more. 
- if hpa.Spec.MaxReplicas > 0 && hpa.Status.CurrentReplicas >= hpa.Spec.MaxReplicas && hpa.Status.DesiredReplicas >= hpa.Spec.MaxReplicas { + diagnosis := hpadiag.Analyze(hpa) + if diagnosis == nil { + continue + } + + if reason, ok := firstHPAReason(diagnosis, hpadiag.ReasonLimitedMax); ok { problems = append(problems, HPAProblem{ Name: hpa.Name, Namespace: hpa.Namespace, Problem: "maxed", - Reason: fmt.Sprintf("%d/%d replicas (wants %d)", hpa.Status.CurrentReplicas, hpa.Spec.MaxReplicas, hpa.Status.DesiredReplicas), + Reason: maxedReasonText(diagnosis, reason), }) } - // "cannot scale" — the autoscaler controller reports it can't get - // metrics or scale calls are failing. Emitted as a separate problem - // so the maxed-check above isn't masked by an unrelated metrics - // outage on the same HPA. - for _, cond := range hpa.Status.Conditions { - if cond.Type == autoscalingv2.ScalingActive && cond.Status == corev1.ConditionFalse { - reason := cond.Reason - if reason == "" { - reason = "ScalingActive=False" - } - msg := cond.Message - if msg == "" { - msg = "HPA controller reports it cannot scale this workload" - } - problems = append(problems, HPAProblem{ - Name: hpa.Name, - Namespace: hpa.Namespace, - Problem: "cannot-scale", - Reason: fmt.Sprintf("%s: %s", reason, msg), - }) - break - } + + if reason, ok := firstHPAReason(diagnosis, hpadiag.ReasonUnableToScale, hpadiag.ReasonMetricsUnavailable); ok { + problems = append(problems, HPAProblem{ + Name: hpa.Name, + Namespace: hpa.Namespace, + Problem: "cannot-scale", + Reason: reasonText(reason), + }) } } return problems } +func firstHPAReason(diagnosis *hpadiag.Diagnosis, ids ...hpadiag.ReasonID) (hpadiag.Reason, bool) { + for _, id := range ids { + for _, reason := range diagnosis.Reasons { + if reason.ID == id { + return reason, true + } + } + } + return hpadiag.Reason{}, false +} + +func reasonText(reason hpadiag.Reason) string { + if reason.ConditionReason != "" && reason.Message != "" { + return 
reason.ConditionReason + ": " + reason.Message + } + if reason.Message != "" { + return reason.Message + } + return string(reason.ID) +} + +func maxedReasonText(diagnosis *hpadiag.Diagnosis, reason hpadiag.Reason) string { + if diagnosis == nil || diagnosis.Bounds.Max <= 0 { + return reasonText(reason) + } + text := fmt.Sprintf("%d/%d replicas", diagnosis.Bounds.Current, diagnosis.Bounds.Max) + if diagnosis.Bounds.Desired > 0 { + text += fmt.Sprintf(" (wants %d)", diagnosis.Bounds.Desired) + } + if detail := reasonText(reason); detail != "" { + return text + ": " + detail + } + return text +} + // CronJobProblem describes a detected issue with a CronJob. type CronJobProblem struct { Name string diff --git a/internal/k8s/detect_workload_test.go b/internal/k8s/detect_workload_test.go index 7650b1195..56f56a435 100644 --- a/internal/k8s/detect_workload_test.go +++ b/internal/k8s/detect_workload_test.go @@ -1,11 +1,14 @@ package k8s import ( + "strings" + "testing" + "time" + autoscalingv2 "k8s.io/api/autoscaling/v2" batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "testing" - "time" ) func TestDetectHPAProblems(t *testing.T) { @@ -14,6 +17,7 @@ func TestDetectHPAProblems(t *testing.T) { hpas []*autoscalingv2.HorizontalPodAutoscaler wantCount int wantProblem string + wantReason string }{ { name: "maxed HPA", @@ -21,11 +25,29 @@ func TestDetectHPAProblems(t *testing.T) { { ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, - Status: autoscalingv2.HorizontalPodAutoscalerStatus{CurrentReplicas: 10, DesiredReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the 
maximum replica count"}, + }, + }, }, }, wantCount: 1, wantProblem: "maxed", + wantReason: "10/10 replicas (wants 10): TooManyReplicas: the desired replica count is more than the maximum replica count", + }, + { + name: "at max without controller limit condition is not maxed", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{CurrentReplicas: 10, DesiredReplicas: 10}, + }, + }, + wantCount: 0, }, { name: "not maxed", @@ -60,6 +82,113 @@ func TestDetectHPAProblems(t *testing.T) { }, wantCount: 0, }, + { + name: "metrics unavailable", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 5, + DesiredReplicas: 5, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "FailedGetResourceMetric", Message: "missing cpu request"}, + }, + }, + }, + }, + wantCount: 1, + wantProblem: "cannot-scale", + }, + { + name: "maxed and metrics unavailable emit two distinct issues", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "FailedGetResourceMetric", Message: "missing cpu request"}, + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more 
than the maximum replica count"}, + }, + }, + }, + }, + wantCount: 2, + }, + { + name: "scaling disabled is not a metrics outage", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "paused", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 0, + DesiredReplicas: 0, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "ScalingDisabled", Message: "scaling is disabled since the replica count of the target is zero"}, + }, + }, + }, + }, + wantCount: 0, + }, + { + name: "pinned min equals max is not maxed", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "fixed", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MinReplicas: ptrInt32(5), + MaxReplicas: 5, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 5, + DesiredReplicas: 5, + }, + }, + }, + wantCount: 0, + }, + { + name: "min limited is drawer context only", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "idle", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MinReplicas: ptrInt32(2), + MaxReplicas: 10, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 2, + DesiredReplicas: 2, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooFewReplicas", Message: "the desired replica count is less than the minimum replica count"}, + }, + }, + }, + }, + wantCount: 0, + }, + { + name: "scale down stabilization is drawer context only", + hpas: []*autoscalingv2.HorizontalPodAutoscaler{ + { + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, + Spec: 
autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 5, + DesiredReplicas: 5, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "ScaleDownStabilized"}, + }, + }, + }, + }, + wantCount: 0, + }, } for _, tt := range tests { @@ -68,15 +197,22 @@ func TestDetectHPAProblems(t *testing.T) { if len(problems) != tt.wantCount { t.Errorf("DetectHPAProblems() returned %d problems, want %d", len(problems), tt.wantCount) } - if tt.wantCount > 0 && len(problems) > 0 { + if tt.wantProblem != "" && len(problems) > 0 { if problems[0].Problem != tt.wantProblem { t.Errorf("problem = %q, want %q", problems[0].Problem, tt.wantProblem) } } + if tt.wantReason != "" && len(problems) > 0 && !strings.Contains(problems[0].Reason, tt.wantReason) { + t.Errorf("reason = %q, want to contain %q", problems[0].Reason, tt.wantReason) + } }) } } +func ptrInt32(v int32) *int32 { + return &v +} + func TestDetectCronJobProblems(t *testing.T) { now := time.Now() suspended := true diff --git a/internal/server/server.go b/internal/server/server.go index a2c0ecfad..d5f3b38d0 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -21,6 +21,7 @@ import ( "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/go-chi/cors" + autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -42,6 +43,7 @@ import ( "github.com/skyhook-io/radar/internal/timeline" "github.com/skyhook-io/radar/internal/updater" "github.com/skyhook-io/radar/internal/version" + "github.com/skyhook-io/radar/pkg/hpadiag" "github.com/skyhook-io/radar/pkg/perfstats" "github.com/skyhook-io/radar/pkg/rbac" topology "github.com/skyhook-io/radar/pkg/topology" @@ -1587,6 +1589,14 @@ func setTypeMeta(resource any) { 
k8s.SetTypeMeta(resource) } +func hpaDiagnosisFor(resource any) *hpadiag.Diagnosis { + hpa, ok := resource.(*autoscalingv2.HorizontalPodAutoscaler) + if !ok { + return nil + } + return hpadiag.Analyze(hpa) +} + // preflightResourceGet runs the per-user RBAC gates that must pass before any // single-resource GET fetch. Mirrors the kind/scope-aware logic used by both // the REST handler (handleGetResource) and the AI handler (handleAIGetResource) @@ -1726,6 +1736,7 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { s.writeJSON(w, topology.ResourceWithRelationships{ Resource: resource, Relationships: relationships, + HPADiagnosis: hpaDiagnosisFor(resource), }) return } @@ -1941,6 +1952,7 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) { response := topology.ResourceWithRelationships{ Resource: resource, Relationships: relationships, + HPADiagnosis: hpaDiagnosisFor(resource), } // Enrich TLS secrets with parsed certificate info diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 0f7494c2d..cbd1d2cc7 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -12,6 +12,7 @@ import ( "testing" appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -107,6 +108,22 @@ func TestMain(m *testing.M) { ReadyReplicas: 1, }, }, + &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx-hpa", Namespace: "default", Generation: 1}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{APIVersion: "apps/v1", Kind: "Deployment", Name: "nginx"}, + MinReplicas: &replicas, + MaxReplicas: 2, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + ObservedGeneration: ptrInt64(1), + CurrentReplicas: 2, + DesiredReplicas: 2, + Conditions: 
[]autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"}, + }, + }, + }, &appsv1.ReplicaSet{ ObjectMeta: metav1.ObjectMeta{ Name: "nginx-abc", @@ -548,6 +565,30 @@ func TestSmokeGetDeployment(t *testing.T) { } } +func TestSmokeGetHPAIncludesDiagnosis(t *testing.T) { + resp, err := http.Get(testServer.URL + "/api/resources/hpas/default/nginx-hpa") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + var body map[string]any + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode: %v", err) + } + diagnosis, ok := body["hpaDiagnosis"].(map[string]any) + if !ok { + t.Fatalf("missing hpaDiagnosis in response: %+v", body) + } + if diagnosis["state"] != "limited_max" { + t.Fatalf("hpaDiagnosis.state = %v, want limited_max", diagnosis["state"]) + } +} + func TestSmokeGetResourceNotFound(t *testing.T) { resp, err := http.Get(testServer.URL + "/api/resources/deployments/default/nonexistent") if err != nil { @@ -577,6 +618,8 @@ func TestSmokeEvents(t *testing.T) { func boolPtr(b bool) *bool { return &b } +func ptrInt64(v int64) *int64 { return &v } + // --- Helpers --- // get is a small helper that issues a GET and returns the response, failing the diff --git a/package-lock.json b/package-lock.json index 6b5f406f6..da7bb48fa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4124,6 +4124,7 @@ "shiki": "^4.2.0" }, "devDependencies": { + "@types/node": "^25.7.0", "@types/react": "^19.2.17", "@types/react-dom": "^19.2.3", "@xterm/addon-fit": "^0.11.0", diff --git a/packages/k8s-ui/package.json b/packages/k8s-ui/package.json index bb1f80521..68ab0e97f 100644 --- a/packages/k8s-ui/package.json +++ b/packages/k8s-ui/package.json @@ -85,6 +85,7 @@ "yaml": 
">=2.0.0" }, "devDependencies": { + "@types/node": "^25.7.0", "@types/react": "^19.2.17", "@types/react-dom": "^19.2.3", "@xterm/addon-fit": "^0.11.0", diff --git a/packages/k8s-ui/src/components/resources/index.ts b/packages/k8s-ui/src/components/resources/index.ts index b8625f72f..748fd95f7 100644 --- a/packages/k8s-ui/src/components/resources/index.ts +++ b/packages/k8s-ui/src/components/resources/index.ts @@ -1,4 +1,5 @@ export * from './resource-utils' +export * from './resource-utils-hpa' export * from './resource-utils-argo' export * from './resource-utils-certmanager' export * from './resource-utils-cnpg' diff --git a/packages/k8s-ui/src/components/resources/renderers/HPARenderer.test.tsx b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.test.tsx new file mode 100644 index 000000000..e192700e5 --- /dev/null +++ b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.test.tsx @@ -0,0 +1,91 @@ +import { describe, expect, it } from 'vitest' +import { renderToString } from 'react-dom/server' +import { HPARenderer } from './HPARenderer' + +const baseHPA = { + metadata: { name: 'api-hpa', namespace: 'prod' }, + spec: { + maxReplicas: 10, + scaleTargetRef: { apiVersion: 'apps/v1', kind: 'Deployment', name: 'api' }, + }, + status: { + currentReplicas: 3, + desiredReplicas: 3, + }, +} + +describe('HPARenderer', () => { + it('does not count ScalingLimited=False as a failing condition', () => { + const html = renderToString( + , + ) + + expect(html).toContain('Conditions (3) · 1 failing') + expect(html).not.toContain('2 failing') + }) + + it('renders ScalingLimited=True at max as a warning instead of a healthy condition', () => { + const html = renderToString( + , + ) + + expect(html).toContain('border-amber-400/60') + expect(html).toContain('text-amber-600') + expect(html).not.toContain('failing') + }) + + it('does not repeat max-limited controller text as a second diagnosis headline', () => { + const html = renderToString( + , + ) + + 
expect(html).toContain('HPA wants more replicas but is capped at maxReplicas=10') + expect(html).toContain('Evidence') + expect(html).toContain('ScalingLimited') + expect(html).toContain('TooManyReplicas') + expect(html).not.toContain('the desired replica count is more than the maximum replica count') + }) +}) diff --git a/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx index d10b60c80..6de9b4e20 100644 --- a/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx +++ b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx @@ -1,114 +1,97 @@ import type { ReactNode } from 'react' -import { Cpu, AlertTriangle } from 'lucide-react' +import { Activity, Cpu } from 'lucide-react' import { clsx } from 'clsx' -import { Section, PropertyList, Property, ConditionsSection, ResourceLink } from '../../ui/drawer-components' +import { Section, PropertyList, Property, ConditionsSection, ResourceLink, type ConditionTone } from '../../ui/drawer-components' +import { Badge, type BadgeSeverity } from '../../ui/Badge' import { kindToPlural } from '../../../utils/navigation' import { formatAge } from '../resource-utils' +import { hpaStateLabel, hpaStateLevel } from '../resource-utils-hpa' +import type { HPADiagnosis, HPADiagnosisState } from '../../../types' interface HPARendererProps { data: any onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void + hpaDiagnosis?: HPADiagnosis /** Optional host-provided section rendered after Conditions — used to inject Prometheus-backed charts. 
*/ extraSections?: ReactNode } -// Extract problems from HPA conditions -function getHPAProblems(data: any): string[] { - const problems: string[] = [] - const conditions = data.status?.conditions || [] - const status = data.status || {} - const spec = data.spec || {} - - // Check conditions for issues - for (const cond of conditions) { - // AbleToScale = False means target not found or can't scale - if (cond.type === 'AbleToScale' && cond.status === 'False') { - problems.push(`Cannot scale: ${cond.reason}${cond.message ? ' - ' + cond.message : ''}`) - } +function hpaBadgeSeverity(state: HPADiagnosisState): BadgeSeverity { + switch (hpaStateLevel(state)) { + case 'healthy': + return 'success' + case 'unhealthy': + return 'error' + case 'degraded': + return 'warning' + case 'alert': + return 'alert' + case 'neutral': + return 'info' + default: + return 'neutral' + } +} - // ScalingActive = False means metrics unavailable - if (cond.type === 'ScalingActive' && cond.status === 'False') { - if (cond.reason === 'FailedGetResourceMetric') { - problems.push('Metrics unavailable — is metrics-server running?') - } else { - problems.push(`Scaling inactive: ${cond.reason}${cond.message ? ' - ' + cond.message : ''}`) - } - } +function hpaConditionTone(condition: any): ConditionTone | undefined { + if (condition?.type !== 'ScalingLimited') return undefined + if (condition.status === 'False') return 'ok' + if (condition.status !== 'True') return 'unknown' - // ScalingLimited = True means at min/max bound - if (cond.type === 'ScalingLimited' && cond.status === 'True') { - if (cond.reason === 'TooFewReplicas') { - problems.push(`At minimum replicas (${spec.minReplicas || 1}) — cannot scale down further`) - } else if (cond.reason === 'TooManyReplicas') { - problems.push(`At maximum replicas (${spec.maxReplicas}) — cannot scale up further`) - } - } - } + const reason = String(condition.reason ?? '').toLowerCase() + const message = String(condition.message ?? 
'').toLowerCase() + if (reason.includes('toomany') || message.includes('maximum')) return 'warning' + if (reason.includes('toofew') || message.includes('minimum')) return 'ok' + return 'warning' +} - // Check for desired != current (scaling in progress or stuck) - if (status.currentReplicas !== undefined && status.desiredReplicas !== undefined) { - if (status.currentReplicas !== status.desiredReplicas) { - const direction = status.desiredReplicas > status.currentReplicas ? 'up' : 'down' - problems.push(`Scaling ${direction}: ${status.currentReplicas} → ${status.desiredReplicas} replicas`) - } - } +function formatReasonID(id: string): string { + return id.replace(/_/g, ' ') +} - return problems +function isReasonMessageRedundant(state: HPADiagnosisState, reason: NonNullable[number]): boolean { + return state === 'limited_max' && reason.id === 'limited_max' } -export function HPARenderer({ data, onNavigate, extraSections }: HPARendererProps) { +export function HPARenderer({ data, onNavigate, hpaDiagnosis, extraSections }: HPARendererProps) { const status = data.status || {} const spec = data.spec || {} const metrics = status.currentMetrics || [] - // Check for problems - const problems = getHPAProblems(data) - const hasProblems = problems.length > 0 - - // Determine if these are errors (red) or warnings (yellow) - const hasErrors = problems.some(p => - p.includes('Cannot scale') || p.includes('unavailable') || p.includes('inactive') - ) - return ( <> - {/* Problems/warnings alert */} - {hasProblems && ( -
-
- -
-
- {hasErrors ? 'Scaling Issues' : 'Scaling Status'} + {hpaDiagnosis && ( +
+
+
+
+
{hpaDiagnosis.summary}
+
+ {hpaDiagnosis.bounds.current}/{hpaDiagnosis.bounds.desired} replicas, bounds {hpaDiagnosis.bounds.min}-{hpaDiagnosis.bounds.max} +
-
    - {problems.map((problem, i) => ( -
  • - - {problem} -
  • - ))} -
+ {hpaStateLabel(hpaDiagnosis.state)}
+ {hpaDiagnosis.reasons && hpaDiagnosis.reasons.length > 0 && ( +
+ {hpaDiagnosis.reasons.map((reason) => ( +
+
+ Evidence + {formatReasonID(reason.id)} + {reason.conditionType && {reason.conditionType}} + {reason.conditionReason && {reason.conditionReason}} + {reason.detail && {reason.detail}} +
+ {!isReasonMessageRedundant(hpaDiagnosis.state, reason) && ( +
{reason.message}
+ )} +
+ ))} +
+ )}
-
+ )}
@@ -132,7 +115,31 @@ export function HPARenderer({ data, onNavigate, extraSections }: HPARendererProp
- {metrics.length > 0 && ( + {hpaDiagnosis?.metrics && hpaDiagnosis.metrics.length > 0 && ( +
+
+ {hpaDiagnosis.metrics.map((metric) => ( +
+
+
+
{metric.name}
+
{metric.type}
+
+
+ {metric.current && Current {metric.current}} + {metric.target && Target {metric.target}} + + {metric.status.replace(/_/g, ' ')} + +
+
+
+ ))} +
+
+ )} + + {!hpaDiagnosis?.metrics?.length && metrics.length > 0 && (
{metrics.map((metric: any, i: number) => { @@ -162,7 +169,7 @@ export function HPARenderer({ data, onNavigate, extraSections }: HPARendererProp
)} - + {extraSections} diff --git a/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.test.tsx b/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.test.tsx index 0a7a20c52..84bae60b1 100644 --- a/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.test.tsx +++ b/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.test.tsx @@ -34,8 +34,91 @@ describe('WorkloadRenderer', () => { expect(html).toContain('disabled=""') expect(html).toContain('Manual scaling is disabled') + expect(html).not.toContain('title="Manual scaling is disabled') expect(html).toContain('Controlled by') expect(html).toContain('HorizontalPodAutoscaler prod/api') expect(html).toContain('ScaledObject prod/api-queue') + expect(html).toContain('flex-wrap') + expect(html).toContain('hpa/') + }) + + it('renders compact HPA diagnosis inline with the state badge first', () => { + const html = renderToString( + {}} + scaleBlockedBy={[{ kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }]} + scalerDiagnostics={[ + { + ref: { kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }, + diagnosis: { + state: 'limited_max', + summary: 'HPA wants more replicas but is capped at maxReplicas=5', + target: { kind: 'Deployment', name: 'api' }, + bounds: { min: 2, max: 5, current: 5, desired: 5 }, + }, + }, + ]} + />, + ) + + expect(html.indexOf('Maxed')).toBeLessThan(html.indexOf('Wants more; capped at maxReplicas=5')) + expect(html).toContain('Wants more; capped at maxReplicas=5') + expect(html).not.toContain('HPA wants more replicas but is capped at maxReplicas=5') + expect(html).toContain('px-2 py-1.5') + }) + + it('uses compact missing-metrics copy in workload autoscaler context', () => { + const html = renderToString( + {}} + scaleBlockedBy={[{ kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }]} + scalerDiagnostics={[ + { + ref: { kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }, + diagnosis: 
{ + state: 'metrics_unavailable', + summary: 'Add memory requests to the target pods so HPA can compute replicas', + target: { kind: 'Deployment', name: 'api' }, + bounds: { min: 2, max: 10, current: 3, desired: 3 }, + metrics: [{ type: 'Resource', name: 'memory', status: 'missing' }], + }, + }, + ]} + />, + ) + + expect(html).toContain('Metrics unavailable') + expect(html).toContain('Add memory requests; HPA cannot compute replicas') + expect(html).not.toContain('Add memory requests to the target pods') + }) + + it('uses compact pinned copy in workload autoscaler context', () => { + const html = renderToString( + {}} + scaleBlockedBy={[{ kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }]} + scalerDiagnostics={[ + { + ref: { kind: 'HorizontalPodAutoscaler', namespace: 'prod', name: 'api' }, + diagnosis: { + state: 'pinned', + summary: 'HPA is configured for a fixed replica count of 3', + target: { kind: 'Deployment', name: 'api' }, + bounds: { min: 3, max: 3, current: 3, desired: 3 }, + }, + }, + ]} + />, + ) + + expect(html).toContain('Pinned') + expect(html).toContain('Fixed at 3 replicas') + expect(html).not.toContain('fixed replica count') }) }) diff --git a/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.tsx b/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.tsx index cf4f9ffb6..99a13393b 100644 --- a/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.tsx +++ b/packages/k8s-ui/src/components/resources/renderers/WorkloadRenderer.tsx @@ -4,15 +4,24 @@ import { clsx } from 'clsx' import { Section, PropertyList, Property, ConditionsSection, PodTemplateSection, AlertBanner, ResourceLink, ResourceRefBadge } from '../../ui/drawer-components' import { DialogPortal } from '../../ui/DialogPortal' import { Tooltip } from '../../ui/Tooltip' -import type { RBACSubjectResponse, RBACPolicyRule, ResourceRef } from '../../../types' +import { Badge, type BadgeSeverity } from '../../ui/Badge' +import type { 
RBACSubjectResponse, RBACPolicyRule, ResourceRef, HPADiagnosis } from '../../../types' import { detectBlastRadius, rulePermissivenessScore } from '../../../utils/rbac-blast-radius' import { RBACErrorSection, isRBACUnavailable } from './RBACErrorSection' +import { hpaStateLabel, hpaStateLevel } from '../resource-utils-hpa' import { rbacVerbBadgeClass, rbacResourceBadgeClass, rbacApiGroupBadgeClass, } from '../../../utils/rbac-badges' +export interface ScalerDiagnosis { + ref: ResourceRef + diagnosis?: HPADiagnosis + loading?: boolean + error?: string +} + interface WorkloadRendererProps { kind: string data: any @@ -21,6 +30,7 @@ interface WorkloadRendererProps { onScale?: (replicas: number) => Promise isScalePending?: boolean scaleBlockedBy?: ResourceRef[] + scalerDiagnostics?: ScalerDiagnosis[] onRequestRefresh?: () => void /** * RBAC reverse-lookup for the workload's pod-template ServiceAccount. @@ -110,7 +120,27 @@ function formatScalerLabel(ref: ResourceRef): string { return `${ref.kind} ${prefix}${ref.name}` } -export function WorkloadRenderer({ kind, data, onNavigate, onViewPods, onScale, isScalePending, scaleBlockedBy, onRequestRefresh, rbacData, rbacLoading, rbacError }: WorkloadRendererProps) { +function compactHPASummary(diagnosis: HPADiagnosis): string { + if (diagnosis.state === 'limited_max') { + return `Wants more; capped at maxReplicas=${diagnosis.bounds.max}` + } + if (diagnosis.state === 'pinned') { + return `Fixed at ${diagnosis.bounds.current} replicas` + } + if (diagnosis.state === 'metrics_unavailable' || diagnosis.state === 'metrics_incomplete') { + const missingMetric = diagnosis.metrics?.find((metric) => metric.status !== 'ok') + if (missingMetric?.type === 'Resource' && missingMetric.name) { + return `Add ${missingMetric.name} requests; HPA cannot compute replicas` + } + if (missingMetric?.name) { + return `${missingMetric.name} metric unavailable; HPA cannot compute replicas` + } + return 'Metrics unavailable; HPA cannot compute replicas' + 
} + return diagnosis.summary +} + +export function WorkloadRenderer({ kind, data, onNavigate, onViewPods, onScale, isScalePending, scaleBlockedBy, scalerDiagnostics, onRequestRefresh, rbacData, rbacLoading, rbacError }: WorkloadRendererProps) { const status = data.status || {} const spec = data.spec || {} const metadata = data.metadata || {} @@ -226,8 +256,15 @@ export function WorkloadRenderer({ kind, data, onNavigate, onViewPods, onScale, value={
{scaleBlockedBy.map((ref) => ( - + ))} + {scalerDiagnostics && scalerDiagnostics.length > 0 && ( +
+ {scalerDiagnostics.map((entry) => ( + + ))} +
+ )}
} /> @@ -261,7 +298,6 @@ export function WorkloadRenderer({ kind, data, onNavigate, onViewPods, onScale, ) } return ( - {kindName}/ - {resourceRef.name} + {content} ) } @@ -922,6 +960,7 @@ function formatKindForRef(kind: string): string { job: 'job', cronjob: 'cj', hpa: 'hpa', + horizontalpodautoscaler: 'hpa', } return shortNames[k] || k } diff --git a/packages/k8s-ui/src/components/workload/WorkloadView.tsx b/packages/k8s-ui/src/components/workload/WorkloadView.tsx index d22551ab0..f5d1df2f7 100644 --- a/packages/k8s-ui/src/components/workload/WorkloadView.tsx +++ b/packages/k8s-ui/src/components/workload/WorkloadView.tsx @@ -22,7 +22,7 @@ import { BarChart3, Network, } from 'lucide-react' -import type { TimelineEvent, ResourceRef, Relationships, SelectedResource, ResolvedEnvFrom, Topology, TopologyNode } from '../../types' +import type { TimelineEvent, ResourceRef, Relationships, SelectedResource, ResolvedEnvFrom, Topology, TopologyNode, HPADiagnosis } from '../../types' import type { GitOpsStatus } from '../../types/gitops' import type { NavigateToResource } from '../../utils/navigation' import { refToSelectedResource, pluralToKind, kindToPlural, apiVersionToGroup } from '../../utils/navigation' @@ -49,6 +49,7 @@ import { import { ResourceActionsBar } from '../shared/ResourceActionsBar' import { EditableYamlView, SaveSuccessAnimation } from '../shared/EditableYamlView' import { ResourceRendererDispatch, getResourceStatus, type RendererOverrides } from '../shared/ResourceRendererDispatch' +import type { ScalerDiagnosis } from '../resources/renderers/WorkloadRenderer' import { DetailShell, type DetailShellTab } from '../shared/DetailShell' import { HelmManagedByChip, ManagedByChip, type HelmOwnerRef } from '../shared/ManagedByChip' import { getKindColorOutline, displayKindName } from '../ui/drawer-components' @@ -105,6 +106,10 @@ interface WorkloadViewProps { relationships?: Relationships /** TLS certificate info for secrets */ certificateInfo?: any + /** HPA 
diagnosis for HorizontalPodAutoscaler detail responses */ + hpaDiagnosis?: HPADiagnosis + /** Compact diagnosis for autoscalers controlling this workload */ + scalerDiagnostics?: ScalerDiagnosis[] /** Whether the resource is loading */ isLoading?: boolean /** Fetch error for the resource (preserves status + message so the @@ -244,6 +249,8 @@ export function WorkloadView({ resource, relationships, certificateInfo, + hpaDiagnosis, + scalerDiagnostics, isLoading: resourceLoading = false, resourceError, refetch: refetchProp, @@ -651,6 +658,8 @@ export function WorkloadView({ data={resource} relationships={relationships} certificateInfo={certificateInfo} + hpaDiagnosis={hpaDiagnosis} + scalerDiagnostics={scalerDiagnostics} onCopy={copyToClipboard} copied={copied} onNavigate={onNavigateToResource ? (ref) => onNavigateToResource(refToSelectedResource(ref)) : undefined} @@ -774,6 +783,8 @@ export function WorkloadView({ resource={resource} selectedResource={selectedResource} relationships={relationships} + hpaDiagnosis={hpaDiagnosis} + scalerDiagnostics={scalerDiagnostics} isLoading={resourceLoading} error={resourceError} onNavigate={onNavigateToResource} @@ -1305,6 +1316,8 @@ function InfoTab({ resource, selectedResource, relationships, + hpaDiagnosis, + scalerDiagnostics, isLoading, error, onNavigate, @@ -1326,6 +1339,8 @@ function InfoTab({ resource: any selectedResource: SelectedResource relationships?: Relationships + hpaDiagnosis?: HPADiagnosis + scalerDiagnostics?: ScalerDiagnosis[] isLoading: boolean error?: unknown onNavigate?: NavigateToResource @@ -1354,6 +1369,8 @@ function InfoTab({ resource={selectedResource} data={resource} relationships={relationships} + hpaDiagnosis={hpaDiagnosis} + scalerDiagnostics={scalerDiagnostics} onCopy={onCopy} copied={copied} onNavigate={onNavigate ? 
/**
 * Headline classification of a HorizontalPodAutoscaler.
 * The string values match the `State` constants declared in pkg/hpadiag/diagnosis.go.
 */
export type HPADiagnosisState =
  | 'ok'
  | 'scaling_up'
  | 'scaling_down'
  | 'limited_max'
  | 'limited_min'
  | 'metrics_unavailable'
  | 'metrics_incomplete'
  | 'unable_to_scale'
  | 'disabled'
  | 'pinned'
  | 'stale'
  | 'stabilized'
  | 'unknown'

/**
 * HPA diagnosis payload. Field names follow the JSON tags of the Go
 * `hpadiag.Diagnosis` struct.
 */
export interface HPADiagnosis {
  /** Headline state chosen for the HPA. */
  state: HPADiagnosisState
  /** One-line human-readable explanation of `state`. */
  summary: string
  /** The HPA's scale target reference (spec.scaleTargetRef). */
  target: {
    apiVersion?: string
    kind?: string
    name?: string
  }
  /** Replica limits and counts taken from the HPA spec and status. */
  bounds: {
    /** minReplicas; the Go side substitutes 1 when the spec leaves it unset. */
    min: number
    /** maxReplicas from the spec. */
    max: number
    /** status.currentReplicas. */
    current: number
    /** status.desiredReplicas. */
    desired: number
    /** status.observedGeneration; omitted when zero. */
    observedGeneration?: number
    /** metadata.generation; omitted when zero. */
    generation?: number
  }
  /** Per-metric summaries; omitted when there are none. */
  metrics?: HPAMetricSummary[]
  /** Individual findings behind `state`; omitted when there are none. */
  reasons?: HPAReasonSummary[]
}

/** One finding that contributed to an HPA diagnosis. */
export interface HPAReasonSummary {
  /** Reason identifier, e.g. 'limited_max' or 'stale_status'. */
  id: string
  /** Human-readable message for the finding. */
  message: string
  /** Optional extra detail, e.g. the names of metrics with no current value. */
  detail?: string
  /** HPA status condition type the finding was derived from, when there is one. */
  conditionType?: string
  /** Reason string of that status condition, when there is one. */
  conditionReason?: string
}

/** Summary of one HPA metric. */
export interface HPAMetricSummary {
  /** Metric source type as reported by the HPA spec/status. */
  type: string
  /** Metric name; the Go side emits 'unknown' when it cannot derive one. */
  name: string
  /** Formatted current value; absent when the status reports none. */
  current?: string
  /** Formatted target value; absent for metrics that appear only in status. */
  target?: string
  /** 'ok', 'missing' or 'status_only' as produced by the Go summarizeMetrics. */
  status: string
}
a/pkg/ai/context/summary.go +++ b/pkg/ai/context/summary.go @@ -15,6 +15,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/skyhook-io/radar/pkg/hpadiag" "github.com/skyhook-io/radar/pkg/resourcecontext" ) @@ -447,8 +448,13 @@ func summarizeHPA(hpa *autoscalingv2.HorizontalPodAutoscaler) *ResourceSummary { s.MinReplicas = hpa.Spec.MinReplicas } - if hpa.Spec.MaxReplicas > 0 && hpa.Status.CurrentReplicas >= hpa.Spec.MaxReplicas && hpa.Status.DesiredReplicas >= hpa.Spec.MaxReplicas { - s.Issue = "maxed" + if diagnosis := hpadiag.Analyze(hpa); diagnosis != nil { + switch diagnosis.State { + case hpadiag.StateLimitedMax: + s.Issue = "maxed" + case hpadiag.StateMetricsUnavailable, hpadiag.StateUnableToScale: + s.Issue = diagnosis.Summary + } } return s diff --git a/pkg/ai/context/summary_test.go b/pkg/ai/context/summary_test.go index efd83630b..c4d715287 100644 --- a/pkg/ai/context/summary_test.go +++ b/pkg/ai/context/summary_test.go @@ -579,7 +579,13 @@ func TestSummary_HPAIssue(t *testing.T) { MaxReplicas: 10, ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{Kind: "Deployment", Name: "web"}, }, - Status: autoscalingv2.HorizontalPodAutoscalerStatus{CurrentReplicas: 10, DesiredReplicas: 10}, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 10, + DesiredReplicas: 10, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"}, + }, + }, }, wantIssue: "maxed", }, @@ -595,6 +601,42 @@ func TestSummary_HPAIssue(t *testing.T) { }, wantIssue: "", }, + { + name: "metrics unavailable", + hpa: &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MaxReplicas: 10, + ScaleTargetRef: 
// State is the single headline classification Analyze assigns to an HPA.
// chooseState derives it from the recorded reasons in a fixed priority order.
type State string

const (
	// StateOK is chosen when none of the reasons checked by chooseState is present.
	StateOK State = "ok"
	// StateScalingUp means desired replicas exceed current replicas.
	StateScalingUp State = "scaling_up"
	// StateScalingDown means desired replicas are below current replicas.
	StateScalingDown State = "scaling_down"
	// StateLimitedMax means the ScalingLimited condition was classified as a maxReplicas cap.
	StateLimitedMax State = "limited_max"
	// StateLimitedMin means the ScalingLimited condition was classified as a minReplicas floor.
	StateLimitedMin State = "limited_min"
	// StateMetricsUnavailable means ScalingActive is False for a reason other than disabled scaling.
	StateMetricsUnavailable State = "metrics_unavailable"
	// StateMetricsIncomplete means some configured metrics have no current value in status.
	StateMetricsIncomplete State = "metrics_incomplete"
	// StateUnableToScale means the AbleToScale condition is False.
	StateUnableToScale State = "unable_to_scale"
	// StateDisabled means ScalingActive is False because scaling is disabled.
	StateDisabled State = "disabled"
	// StatePinned means minReplicas equals maxReplicas, so the replica count is fixed.
	StatePinned State = "pinned"
	// StateStale means status.observedGeneration lags behind metadata.generation.
	StateStale State = "stale"
	// StateStabilized means replicas are being held by scale-down stabilization.
	StateStabilized State = "stabilized"
	// StateUnknown is not returned by chooseState; summarizeState maps it
	// (and any unrecognised state) to "HPA status is unknown".
	StateUnknown State = "unknown"
)
+ ReasonScalingUp ReasonID = "scaling_up" + ReasonScalingDown ReasonID = "scaling_down" + ReasonLimitedMax ReasonID = "limited_max" + ReasonLimitedMin ReasonID = "limited_min" + ReasonMetricsUnavailable ReasonID = "metrics_unavailable" + ReasonUnableToScale ReasonID = "unable_to_scale" + ReasonScalingDisabled ReasonID = "scaling_disabled" + ReasonPinned ReasonID = "pinned" + ReasonStaleStatus ReasonID = "stale_status" + ReasonScaleDownStabilized ReasonID = "scale_down_stabilized" + ReasonMissingCurrentMetric ReasonID = "missing_current_metric" +) + +type Diagnosis struct { + State State `json:"state"` + Summary string `json:"summary"` + Target TargetRef `json:"target"` + Bounds ReplicaBounds `json:"bounds"` + Metrics []MetricSummary `json:"metrics,omitempty"` + Reasons []Reason `json:"reasons,omitempty"` +} + +type TargetRef struct { + APIVersion string `json:"apiVersion,omitempty"` + Kind string `json:"kind,omitempty"` + Name string `json:"name,omitempty"` +} + +type ReplicaBounds struct { + Min int32 `json:"min"` + Max int32 `json:"max"` + Current int32 `json:"current"` + Desired int32 `json:"desired"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + Generation int64 `json:"generation,omitempty"` +} + +type Reason struct { + ID ReasonID `json:"id"` + Message string `json:"message"` + Detail string `json:"detail,omitempty"` + ConditionType string `json:"conditionType,omitempty"` + ConditionReason string `json:"conditionReason,omitempty"` +} + +type MetricSummary struct { + Type string `json:"type"` + Name string `json:"name"` + Current string `json:"current,omitempty"` + Target string `json:"target,omitempty"` + Status string `json:"status"` +} + +func Analyze(hpa *autoscalingv2.HorizontalPodAutoscaler) *Diagnosis { + if hpa == nil { + return nil + } + + min := int32(1) + if hpa.Spec.MinReplicas != nil { + min = *hpa.Spec.MinReplicas + } + observedGeneration := int64(0) + if hpa.Status.ObservedGeneration != nil { + observedGeneration = 
*hpa.Status.ObservedGeneration + } + + d := &Diagnosis{ + State: StateOK, + Target: TargetRef{ + APIVersion: hpa.Spec.ScaleTargetRef.APIVersion, + Kind: hpa.Spec.ScaleTargetRef.Kind, + Name: hpa.Spec.ScaleTargetRef.Name, + }, + Bounds: ReplicaBounds{ + Min: min, + Max: hpa.Spec.MaxReplicas, + Current: hpa.Status.CurrentReplicas, + Desired: hpa.Status.DesiredReplicas, + ObservedGeneration: observedGeneration, + Generation: hpa.Generation, + }, + } + + conditions := mapConditions(hpa.Status.Conditions) + + if cond, ok := conditions[autoscalingv2.AbleToScale]; ok && cond.Status == corev1.ConditionFalse { + d.addConditionReason(ReasonUnableToScale, cond, "HPA controller cannot scale the target") + } + + if cond, ok := conditions[autoscalingv2.ScalingActive]; ok && cond.Status == corev1.ConditionFalse { + if isScalingDisabled(cond) { + d.addConditionReason(ReasonScalingDisabled, cond, "HPA scaling is disabled because the target has zero replicas") + } else { + d.addConditionReason(ReasonMetricsUnavailable, cond, "HPA controller cannot read scaling metrics") + } + } + + if cond, ok := conditions[autoscalingv2.ScalingLimited]; ok && cond.Status == corev1.ConditionTrue { + reason := strings.ToLower(cond.Reason) + message := strings.ToLower(cond.Message) + switch { + case isPinned(min, hpa.Spec.MaxReplicas) && (strings.Contains(reason, "toomany") || strings.Contains(reason, "toofew") || strings.Contains(message, "maximum") || strings.Contains(message, "minimum")): + d.addConditionReason(ReasonPinned, cond, fmt.Sprintf("HPA is pinned at %d replicas", hpa.Spec.MaxReplicas)) + case strings.Contains(reason, "toomany") || strings.Contains(message, "maximum"): + d.addConditionReason(ReasonLimitedMax, cond, fmt.Sprintf("HPA is capped at maxReplicas=%d", hpa.Spec.MaxReplicas)) + case strings.Contains(reason, "toofew") || strings.Contains(message, "minimum"): + d.addConditionReason(ReasonLimitedMin, cond, fmt.Sprintf("HPA is held at minReplicas=%d", min)) + case 
strings.Contains(reason, "stabiliz") || strings.Contains(message, "stabiliz"): + d.addConditionReason(ReasonScaleDownStabilized, cond, "HPA is holding replicas because of scale-down stabilization") + } + } + + if hpa.Generation > 0 && observedGeneration > 0 && observedGeneration < hpa.Generation { + d.Reasons = append(d.Reasons, Reason{ + ID: ReasonStaleStatus, + Message: "HPA status has not observed the latest spec generation yet", + Detail: fmt.Sprintf("observed generation %d, current generation %d", observedGeneration, hpa.Generation), + }) + } + + switch { + case hpa.Status.DesiredReplicas > hpa.Status.CurrentReplicas: + d.Reasons = append(d.Reasons, Reason{ + ID: ReasonScalingUp, + Message: fmt.Sprintf("Scaling up from %d to %d replicas", hpa.Status.CurrentReplicas, hpa.Status.DesiredReplicas), + }) + case hpa.Status.DesiredReplicas < hpa.Status.CurrentReplicas: + d.Reasons = append(d.Reasons, Reason{ + ID: ReasonScalingDown, + Message: fmt.Sprintf("Scaling down from %d to %d replicas", hpa.Status.CurrentReplicas, hpa.Status.DesiredReplicas), + }) + } + + d.Metrics = summarizeMetrics(hpa) + if len(hpa.Status.CurrentMetrics) > 0 { + if missing := missingMetricNames(d.Metrics); len(missing) > 0 && !d.hasReason(ReasonMetricsUnavailable) { + d.Reasons = append(d.Reasons, Reason{ + ID: ReasonMissingCurrentMetric, + Message: "HPA status is missing current values for one or more configured metrics", + Detail: strings.Join(missing, ", "), + }) + } + } + if isPinned(min, hpa.Spec.MaxReplicas) && + hpa.Status.CurrentReplicas == hpa.Status.DesiredReplicas && + hpa.Status.DesiredReplicas == hpa.Spec.MaxReplicas && + !d.hasReason(ReasonPinned) { + d.Reasons = append(d.Reasons, Reason{ + ID: ReasonPinned, + Message: fmt.Sprintf("HPA is pinned at %d replicas", hpa.Spec.MaxReplicas), + }) + } + + d.State = chooseState(d) + d.Summary = summarizeState(d) + return d +} + +func mapConditions(conditions []autoscalingv2.HorizontalPodAutoscalerCondition) 
// isPinned reports whether the replica bounds leave no room to scale:
// max must be positive and equal to min.
func isPinned(min, max int32) bool {
	if max <= 0 {
		return false
	}
	return min == max
}
case StateUnableToScale: + return "HPA cannot read or update the target scale" + case StateMetricsUnavailable: + if metric := missingRequestMetric(d); metric != "" { + return fmt.Sprintf("Add %s requests to the target pods so HPA can compute replicas", metric) + } + return "HPA cannot compute replicas because required metrics are unavailable" + case StateMetricsIncomplete: + if detail := firstReasonDetail(d, ReasonMissingCurrentMetric); detail != "" { + return fmt.Sprintf("HPA is missing current metric values for %s", detail) + } + return "HPA is missing current metric values" + case StateLimitedMax: + if d.Bounds.Max > 0 { + if controllerReportedMaxLimit(d) { + return fmt.Sprintf("HPA wants more replicas but is capped at maxReplicas=%d", d.Bounds.Max) + } + return fmt.Sprintf("HPA is at maxReplicas=%d", d.Bounds.Max) + } + return "HPA is capped at maxReplicas" + case StateDisabled: + return "HPA scaling is disabled because the target has zero replicas" + case StatePinned: + if d.Bounds.Max > 0 { + return fmt.Sprintf("HPA is configured for a fixed replica count of %d", d.Bounds.Max) + } + return "HPA is configured for a fixed replica count" + case StateLimitedMin: + return fmt.Sprintf("HPA is holding at minReplicas=%d", d.Bounds.Min) + case StateStale: + return "HPA has not observed the latest spec generation yet" + case StateScalingUp: + return firstReasonMessage(d, ReasonScalingUp, "HPA is scaling up") + case StateScalingDown: + return firstReasonMessage(d, ReasonScalingDown, "HPA is scaling down") + case StateStabilized: + return "HPA is holding replicas during scale-down stabilization" + case StateOK: + return "HPA is within configured bounds" + default: + return "HPA status is unknown" + } +} + +func firstReasonMessage(d *Diagnosis, id ReasonID, fallback string) string { + for _, reason := range d.Reasons { + if reason.ID == id && reason.Message != "" { + return reason.Message + } + } + return fallback +} + +func firstReasonDetail(d *Diagnosis, id ReasonID) 
// formatMetricName normalises well-known resource names for display:
// any casing of "cpu" becomes "CPU" and any casing of "memory" becomes
// "memory". Every other name is returned unchanged.
func formatMetricName(name string) string {
	lowered := strings.ToLower(name)
	if lowered == "cpu" {
		return "CPU"
	}
	if lowered == "memory" {
		return "memory"
	}
	return name
}
`json:"expectedReasons"` + ExpectedSummary string `json:"expectedSummary,omitempty"` +} + +func TestAnalyzeFixtures(t *testing.T) { + for _, tc := range loadFixtureCases(t) { + t.Run(tc.Name, func(t *testing.T) { + var hpa autoscalingv2.HorizontalPodAutoscaler + if err := json.Unmarshal(tc.HPA, &hpa); err != nil { + t.Fatalf("unmarshal HPA: %v", err) + } + + got := Analyze(&hpa) + if got == nil { + t.Fatal("Analyze returned nil") + } + if got.State != tc.ExpectedState { + t.Fatalf("state = %q, want %q; diagnosis=%+v", got.State, tc.ExpectedState, got) + } + if gotReasons := reasonIDs(got); !reflect.DeepEqual(gotReasons, tc.ExpectedReasons) { + t.Fatalf("reasons = %v, want %v; diagnosis=%+v", gotReasons, tc.ExpectedReasons, got) + } + if tc.ExpectedSummary != "" && got.Summary != tc.ExpectedSummary { + t.Fatalf("summary = %q, want %q; diagnosis=%+v", got.Summary, tc.ExpectedSummary, got) + } + }) + } +} + +func TestAnalyzeFormatsResourceMetric(t *testing.T) { + tc := loadFixtureByName(t, "stable") + var hpa autoscalingv2.HorizontalPodAutoscaler + if err := json.Unmarshal(tc.HPA, &hpa); err != nil { + t.Fatalf("unmarshal HPA: %v", err) + } + got := Analyze(&hpa) + if len(got.Metrics) != 1 { + t.Fatalf("metrics len = %d, want 1", len(got.Metrics)) + } + metric := got.Metrics[0] + if metric.Name != "cpu" || metric.Current != "55% utilization" || metric.Target != "70% utilization" || metric.Status != "ok" { + t.Fatalf("metric = %+v", metric) + } +} + +func TestAnalyzeSkipsEmptyStatusOnlyMetric(t *testing.T) { + target := int32(80) + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MaxReplicas: 10, + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "api", + }, + Metrics: []autoscalingv2.MetricSpec{{ + Type: autoscalingv2.ResourceMetricSourceType, + Resource: &autoscalingv2.ResourceMetricSource{ + Name: corev1.ResourceCPU, + Target: 
autoscalingv2.MetricTarget{ + Type: autoscalingv2.UtilizationMetricType, + AverageUtilization: &target, + }, + }, + }}, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + CurrentReplicas: 3, + DesiredReplicas: 3, + CurrentMetrics: []autoscalingv2.MetricStatus{{}}, + }, + } + + got := Analyze(hpa) + if len(got.Metrics) != 1 { + t.Fatalf("metrics len = %d, want 1; metrics=%+v", len(got.Metrics), got.Metrics) + } + metric := got.Metrics[0] + if metric.Name != "cpu" || metric.Status != "missing" { + t.Fatalf("metric = %+v, want missing cpu metric only", metric) + } +} + +func TestAnalyzePrefersScalingOverStaleStatus(t *testing.T) { + observedGeneration := int64(2) + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + Namespace: "default", + Generation: 3, + }, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + MaxReplicas: 10, + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "worker", + }, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + ObservedGeneration: &observedGeneration, + CurrentReplicas: 2, + DesiredReplicas: 5, + }, + } + + got := Analyze(hpa) + if got.State != StateScalingUp { + t.Fatalf("state = %q, want %q; diagnosis=%+v", got.State, StateScalingUp, got) + } + if got.Summary != "Scaling up from 2 to 5 replicas" { + t.Fatalf("summary = %q", got.Summary) + } + wantReasons := []ReasonID{ReasonStaleStatus, ReasonScalingUp} + if gotReasons := reasonIDs(got); !reflect.DeepEqual(gotReasons, wantReasons) { + t.Fatalf("reasons = %v, want %v; diagnosis=%+v", gotReasons, wantReasons, got) + } +} + +func loadFixtureCases(t *testing.T) []fixtureCase { + t.Helper() + path := filepath.Join("..", "..", "testdata", "hpa-diagnosis", "cases.json") + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read fixtures: %v", err) + } + var cases []fixtureCase + if err := json.Unmarshal(raw, &cases); err != nil { + t.Fatalf("unmarshal 
fixtures: %v", err) + } + return cases +} + +func loadFixtureByName(t *testing.T, name string) fixtureCase { + t.Helper() + for _, tc := range loadFixtureCases(t) { + if tc.Name == name { + return tc + } + } + t.Fatalf("fixture %q not found", name) + return fixtureCase{} +} + +func reasonIDs(d *Diagnosis) []ReasonID { + out := make([]ReasonID, 0, len(d.Reasons)) + for _, reason := range d.Reasons { + out = append(out, reason.ID) + } + return out +} diff --git a/pkg/hpadiag/metrics.go b/pkg/hpadiag/metrics.go new file mode 100644 index 000000000..01bc7735f --- /dev/null +++ b/pkg/hpadiag/metrics.go @@ -0,0 +1,204 @@ +package hpadiag + +import ( + "fmt" + + autoscalingv2 "k8s.io/api/autoscaling/v2" +) + +func summarizeMetrics(hpa *autoscalingv2.HorizontalPodAutoscaler) []MetricSummary { + if hpa == nil || (len(hpa.Spec.Metrics) == 0 && len(hpa.Status.CurrentMetrics) == 0) { + return nil + } + + currentByKey := make(map[string]autoscalingv2.MetricStatus, len(hpa.Status.CurrentMetrics)) + for _, current := range hpa.Status.CurrentMetrics { + currentByKey[metricStatusKey(current)] = current + } + + seen := make(map[string]struct{}, len(hpa.Spec.Metrics)) + out := make([]MetricSummary, 0, len(hpa.Spec.Metrics)) + for _, spec := range hpa.Spec.Metrics { + key := metricSpecKey(spec) + seen[key] = struct{}{} + current, ok := currentByKey[key] + status := "ok" + currentValue := "" + if ok { + currentValue = formatMetricCurrent(current) + } + if !ok || currentValue == "" { + status = "missing" + } + out = append(out, MetricSummary{ + Type: string(spec.Type), + Name: metricSpecName(spec), + Current: currentValue, + Target: formatMetricTarget(spec), + Status: status, + }) + } + + for _, current := range hpa.Status.CurrentMetrics { + key := metricStatusKey(current) + if _, ok := seen[key]; ok { + continue + } + name := metricStatusName(current) + currentValue := formatMetricCurrent(current) + if name == "unknown" && currentValue == "" { + continue + } + out = append(out, 
MetricSummary{ + Type: string(current.Type), + Name: name, + Current: currentValue, + Status: "status_only", + }) + } + + return out +} + +func metricSpecKey(metric autoscalingv2.MetricSpec) string { + return string(metric.Type) + "/" + metricSpecName(metric) +} + +func metricStatusKey(metric autoscalingv2.MetricStatus) string { + return string(metric.Type) + "/" + metricStatusName(metric) +} + +func metricSpecName(metric autoscalingv2.MetricSpec) string { + switch metric.Type { + case autoscalingv2.ResourceMetricSourceType: + if metric.Resource != nil { + return metric.Resource.Name.String() + } + case autoscalingv2.ContainerResourceMetricSourceType: + if metric.ContainerResource != nil { + return fmt.Sprintf("%s/%s", metric.ContainerResource.Container, metric.ContainerResource.Name.String()) + } + case autoscalingv2.PodsMetricSourceType: + if metric.Pods != nil { + return metric.Pods.Metric.Name + } + case autoscalingv2.ObjectMetricSourceType: + if metric.Object != nil { + return fmt.Sprintf("%s/%s/%s", metric.Object.DescribedObject.Kind, metric.Object.DescribedObject.Name, metric.Object.Metric.Name) + } + case autoscalingv2.ExternalMetricSourceType: + if metric.External != nil { + return metric.External.Metric.Name + } + } + return "unknown" +} + +func metricStatusName(metric autoscalingv2.MetricStatus) string { + switch metric.Type { + case autoscalingv2.ResourceMetricSourceType: + if metric.Resource != nil { + return metric.Resource.Name.String() + } + case autoscalingv2.ContainerResourceMetricSourceType: + if metric.ContainerResource != nil { + return fmt.Sprintf("%s/%s", metric.ContainerResource.Container, metric.ContainerResource.Name.String()) + } + case autoscalingv2.PodsMetricSourceType: + if metric.Pods != nil { + return metric.Pods.Metric.Name + } + case autoscalingv2.ObjectMetricSourceType: + if metric.Object != nil { + return fmt.Sprintf("%s/%s/%s", metric.Object.DescribedObject.Kind, metric.Object.DescribedObject.Name, metric.Object.Metric.Name) + } 
+ case autoscalingv2.ExternalMetricSourceType: + if metric.External != nil { + return metric.External.Metric.Name + } + } + return "unknown" +} + +func formatMetricTarget(metric autoscalingv2.MetricSpec) string { + switch metric.Type { + case autoscalingv2.ResourceMetricSourceType: + if metric.Resource != nil { + return formatTarget(metric.Resource.Target) + } + case autoscalingv2.ContainerResourceMetricSourceType: + if metric.ContainerResource != nil { + return formatTarget(metric.ContainerResource.Target) + } + case autoscalingv2.PodsMetricSourceType: + if metric.Pods != nil { + return formatTarget(metric.Pods.Target) + } + case autoscalingv2.ObjectMetricSourceType: + if metric.Object != nil { + return formatTarget(metric.Object.Target) + } + case autoscalingv2.ExternalMetricSourceType: + if metric.External != nil { + return formatTarget(metric.External.Target) + } + } + return "" +} + +func formatTarget(target autoscalingv2.MetricTarget) string { + switch target.Type { + case autoscalingv2.UtilizationMetricType: + if target.AverageUtilization != nil { + return fmt.Sprintf("%d%% utilization", *target.AverageUtilization) + } + case autoscalingv2.ValueMetricType: + if target.Value != nil { + return target.Value.String() + } + case autoscalingv2.AverageValueMetricType: + if target.AverageValue != nil { + return target.AverageValue.String() + " average" + } + } + return "" +} + +func formatMetricCurrent(metric autoscalingv2.MetricStatus) string { + switch metric.Type { + case autoscalingv2.ResourceMetricSourceType: + if metric.Resource != nil { + return formatCurrent(metric.Resource.Current) + } + case autoscalingv2.ContainerResourceMetricSourceType: + if metric.ContainerResource != nil { + return formatCurrent(metric.ContainerResource.Current) + } + case autoscalingv2.PodsMetricSourceType: + if metric.Pods != nil { + return formatCurrent(metric.Pods.Current) + } + case autoscalingv2.ObjectMetricSourceType: + if metric.Object != nil { + return 
formatCurrent(metric.Object.Current) + } + case autoscalingv2.ExternalMetricSourceType: + if metric.External != nil { + return formatCurrent(metric.External.Current) + } + } + return "" +} + +func formatCurrent(current autoscalingv2.MetricValueStatus) string { + if current.AverageUtilization != nil { + return fmt.Sprintf("%d%% utilization", *current.AverageUtilization) + } + if current.AverageValue != nil { + return current.AverageValue.String() + " average" + } + if current.Value != nil { + return current.Value.String() + } + return "" +} diff --git a/pkg/resourcecontext/build.go b/pkg/resourcecontext/build.go index ef793a946..a59ce0d27 100644 --- a/pkg/resourcecontext/build.go +++ b/pkg/resourcecontext/build.go @@ -18,6 +18,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "github.com/skyhook-io/radar/pkg/hpadiag" "github.com/skyhook-io/radar/pkg/topology" ) @@ -270,6 +271,7 @@ func Build(ctx context.Context, obj runtime.Object, opts Options) *ResourceConte rc.PVCSummary = buildPVCSummary(obj) rc.JobSummary = buildJobSummary(obj) rc.CronJobSummary = buildCronJobSummary(ctx, obj, opts.AccessChecker, omitted) + rc.HPASummary = buildHPASummary(obj) rc.StatusSummary = buildStatusSummary(obj) // 4. Pre-computed summaries — pass-through. 
@@ -1129,6 +1131,52 @@ func buildCronJobSummary(ctx context.Context, obj runtime.Object, ac RefAccessCh return out } +func buildHPASummary(obj runtime.Object) *HPASummary { + hpa, ok := obj.(*autoscalingv2.HorizontalPodAutoscaler) + if !ok || hpa == nil { + return nil + } + diagnosis := hpadiag.Analyze(hpa) + if diagnosis == nil { + return nil + } + out := &HPASummary{ + State: string(diagnosis.State), + Summary: diagnosis.Summary, + Target: &ContextRef{ + Kind: diagnosis.Target.Kind, + Group: groupFromAPIVersion(diagnosis.Target.APIVersion), + Namespace: hpa.Namespace, + Name: diagnosis.Target.Name, + }, + Bounds: &HPAReplicaBounds{ + Min: diagnosis.Bounds.Min, + Max: diagnosis.Bounds.Max, + Current: diagnosis.Bounds.Current, + Desired: diagnosis.Bounds.Desired, + ObservedGeneration: diagnosis.Bounds.ObservedGeneration, + Generation: diagnosis.Bounds.Generation, + }, + } + for _, metric := range diagnosis.Metrics { + out.Metrics = append(out.Metrics, HPAMetricSummary{ + Type: metric.Type, + Name: metric.Name, + Current: metric.Current, + Target: metric.Target, + Status: metric.Status, + }) + } + for _, reason := range diagnosis.Reasons { + out.Reasons = append(out.Reasons, HPAReasonSummary{ + ID: string(reason.ID), + Message: reason.Message, + Detail: reason.Detail, + }) + } + return out +} + func replicasOrZero(p *int32) int32 { if p == nil { return 0 diff --git a/pkg/resourcecontext/build_test.go b/pkg/resourcecontext/build_test.go index 91304b507..5f5426c31 100644 --- a/pkg/resourcecontext/build_test.go +++ b/pkg/resourcecontext/build_test.go @@ -794,7 +794,19 @@ func TestBuild_NilObj(t *testing.T) { func TestBuild_HPA_Identity(t *testing.T) { hpa := &autoscalingv2.HorizontalPodAutoscaler{ - ObjectMeta: metav1.ObjectMeta{Name: "web-hpa", Namespace: "prod"}, + ObjectMeta: metav1.ObjectMeta{Name: "web-hpa", Namespace: "prod", Generation: 1}, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{APIVersion: 
"apps/v1", Kind: "Deployment", Name: "web"}, + MaxReplicas: 10, + }, + Status: autoscalingv2.HorizontalPodAutoscalerStatus{ + ObservedGeneration: ptrInt64(1), + CurrentReplicas: 10, + DesiredReplicas: 10, + Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{ + {Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"}, + }, + }, } rc := Build(context.Background(), hpa, Options{Tier: TierBasic, AccessChecker: allowAllChecker{}}) if rc == nil { @@ -803,6 +815,15 @@ func TestBuild_HPA_Identity(t *testing.T) { if rc.Tier != TierBasic { t.Errorf("Tier: got %q want %q", rc.Tier, TierBasic) } + if rc.HPASummary == nil { + t.Fatal("Build returned no HPA summary") + } + if rc.HPASummary.State != "limited_max" { + t.Fatalf("HPA state = %q, want limited_max", rc.HPASummary.State) + } + if rc.HPASummary.Target == nil || rc.HPASummary.Target.Kind != "Deployment" || rc.HPASummary.Target.Group != "apps" { + t.Fatalf("target = %+v", rc.HPASummary.Target) + } } func TestBuild_PolicyReports_BasicTierCountsOnly(t *testing.T) { @@ -1095,6 +1116,8 @@ func TestBuild_SecretReferencedByCapsAndSkipsOwnedPods(t *testing.T) { func ptrBool(b bool) *bool { return &b } +func ptrInt64(v int64) *int64 { return &v } + func stringSlicesEqual(a, b []string) bool { if len(a) != len(b) { return false diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index 146733362..dc8f49390 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -46,6 +46,7 @@ type ResourceContext struct { PVCSummary *PVCSummary `json:"pvcSummary,omitempty"` JobSummary *JobSummary `json:"jobSummary,omitempty"` CronJobSummary *CronJobSummary `json:"cronJobSummary,omitempty"` + HPASummary *HPASummary `json:"hpaSummary,omitempty"` IssueSummary *IssueSummary `json:"issueSummary,omitempty"` AuditSummary *AuditSummary `json:"auditSummary,omitempty"` PolicySummary 
*PolicySummary `json:"policySummary,omitempty"` @@ -290,6 +291,38 @@ type CronJobSummary struct { LastSuccessfulTime string `json:"lastSuccessfulTime,omitempty"` } +type HPASummary struct { + State string `json:"state"` + Summary string `json:"summary"` + Target *ContextRef `json:"target,omitempty"` + Bounds *HPAReplicaBounds `json:"bounds,omitempty"` + Metrics []HPAMetricSummary `json:"metrics,omitempty"` + Reasons []HPAReasonSummary `json:"reasons,omitempty"` +} + +type HPAReplicaBounds struct { + Min int32 `json:"min"` + Max int32 `json:"max"` + Current int32 `json:"current"` + Desired int32 `json:"desired"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + Generation int64 `json:"generation,omitempty"` +} + +type HPAReasonSummary struct { + ID string `json:"id"` + Message string `json:"message"` + Detail string `json:"detail,omitempty"` +} + +type HPAMetricSummary struct { + Type string `json:"type"` + Name string `json:"name"` + Current string `json:"current,omitempty"` + Target string `json:"target,omitempty"` + Status string `json:"status"` +} + // IssueSummary is a rollup of internal issue-engine findings scoped to // the subject resource. Pre-computed by callers and passed into the // generator — this package does not import internal/issues. 
diff --git a/pkg/topology/types.go b/pkg/topology/types.go index 945336b30..0b8375deb 100644 --- a/pkg/topology/types.go +++ b/pkg/topology/types.go @@ -12,6 +12,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/skyhook-io/radar/pkg/hpadiag" k8score "github.com/skyhook-io/radar/pkg/k8score" ) @@ -358,6 +359,7 @@ type ResourceWithRelationships struct { Resource any `json:"resource"` Relationships *Relationships `json:"relationships,omitempty"` CertificateInfo *SecretCertificateInfo `json:"certificateInfo,omitempty"` + HPADiagnosis *hpadiag.Diagnosis `json:"hpaDiagnosis,omitempty"` } // ResourceStatus holds computed status for a resource. diff --git a/testdata/hpa-diagnosis/cases.json b/testdata/hpa-diagnosis/cases.json new file mode 100644 index 000000000..fb3389543 --- /dev/null +++ b/testdata/hpa-diagnosis/cases.json @@ -0,0 +1,308 @@ +[ + { + "name": "stable", + "expectedState": "ok", + "expectedReasons": [], + "expectedSummary": "HPA is within configured bounds", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "web", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "web" }, + "minReplicas": 2, + "maxReplicas": 10, + "metrics": [ + { "type": "Resource", "resource": { "name": "cpu", "target": { "type": "Utilization", "averageUtilization": 70 } } } + ] + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 4, + "desiredReplicas": 4, + "currentMetrics": [ + { "type": "Resource", "resource": { "name": "cpu", "current": { "averageUtilization": 55 } } } + ], + "conditions": [ + { "type": "AbleToScale", "status": "True", "reason": "SucceededGetScale" }, + { "type": "ScalingActive", "status": "True", "reason": "ValidMetricFound" }, + { "type": "ScalingLimited", "status": "False", "reason": "DesiredWithinRange" } + ] + } + 
} + }, + { + "name": "maxed", + "expectedState": "limited_max", + "expectedReasons": ["limited_max"], + "expectedSummary": "HPA wants more replicas but is capped at maxReplicas=10", + "expectedTableState": "limited_max", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "web", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "web" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 10, + "desiredReplicas": 10, + "conditions": [ + { "type": "ScalingLimited", "status": "True", "reason": "TooManyReplicas", "message": "the desired replica count is more than the maximum replica count" } + ] + } + } + }, + { + "name": "at max without limit condition", + "expectedState": "ok", + "expectedReasons": [], + "expectedSummary": "HPA is within configured bounds", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "web", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "web" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 10, + "desiredReplicas": 10 + } + } + }, + { + "name": "metrics unavailable", + "expectedState": "metrics_unavailable", + "expectedReasons": ["metrics_unavailable"], + "expectedSummary": "Add CPU requests to the target pods so HPA can compute replicas", + "expectedTableState": "metrics_unavailable", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "api", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "api" }, + "minReplicas": 2, + "maxReplicas": 10, + "metrics": [ + { "type": "Resource", "resource": { 
"name": "cpu", "target": { "type": "Utilization", "averageUtilization": 80 } } } + ] + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 3, + "desiredReplicas": 3, + "conditions": [ + { "type": "ScalingActive", "status": "False", "reason": "FailedGetResourceMetric", "message": "missing request for cpu" } + ] + } + } + }, + { + "name": "partial metrics missing", + "expectedState": "metrics_incomplete", + "expectedReasons": ["missing_current_metric"], + "expectedSummary": "HPA is missing current metric values for memory", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "api", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "api" }, + "minReplicas": 2, + "maxReplicas": 10, + "metrics": [ + { "type": "Resource", "resource": { "name": "cpu", "target": { "type": "Utilization", "averageUtilization": 80 } } }, + { "type": "Resource", "resource": { "name": "memory", "target": { "type": "Utilization", "averageUtilization": 70 } } } + ] + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 3, + "desiredReplicas": 3, + "currentMetrics": [ + { "type": "Resource", "resource": { "name": "cpu", "current": { "averageUtilization": 65 } } } + ] + } + } + }, + { + "name": "unable to scale", + "expectedState": "unable_to_scale", + "expectedReasons": ["unable_to_scale"], + "expectedSummary": "HPA cannot read or update the target scale", + "expectedTableState": "unable_to_scale", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "api", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "api" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 3, + "desiredReplicas": 3, + "conditions": [ + { "type": 
"AbleToScale", "status": "False", "reason": "FailedGetScale", "message": "deployments/scale.apps \"api\" not found" } + ] + } + } + }, + { + "name": "scaling disabled at zero replicas", + "expectedState": "disabled", + "expectedReasons": ["scaling_disabled"], + "expectedSummary": "HPA scaling is disabled because the target has zero replicas", + "expectedTableState": "disabled", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "paused", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "paused" }, + "minReplicas": 1, + "maxReplicas": 10, + "metrics": [ + { "type": "Resource", "resource": { "name": "cpu", "target": { "type": "Utilization", "averageUtilization": 70 } } } + ] + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 0, + "desiredReplicas": 0, + "conditions": [ + { "type": "ScalingActive", "status": "False", "reason": "ScalingDisabled", "message": "scaling is disabled since the replica count of the target is zero" } + ] + } + } + }, + { + "name": "pinned replicas", + "expectedState": "pinned", + "expectedReasons": ["pinned"], + "expectedSummary": "HPA is configured for a fixed replica count of 5", + "expectedTableState": "pinned", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "fixed", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "fixed" }, + "minReplicas": 5, + "maxReplicas": 5 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 5, + "desiredReplicas": 5 + } + } + }, + { + "name": "scaling up", + "expectedState": "scaling_up", + "expectedReasons": ["scaling_up"], + "expectedSummary": "Scaling up from 4 to 8 replicas", + "expectedTableState": "scaling_up", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": 
{ "name": "worker", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "worker" }, + "minReplicas": 1, + "maxReplicas": 20 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 4, + "desiredReplicas": 8 + } + } + }, + { + "name": "stale status", + "expectedState": "stale", + "expectedReasons": ["stale_status"], + "expectedSummary": "HPA has not observed the latest spec generation yet", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "web", "namespace": "default", "generation": 3 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "web" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 2, + "currentReplicas": 4, + "desiredReplicas": 4 + } + } + }, + { + "name": "min limited", + "expectedState": "limited_min", + "expectedReasons": ["limited_min"], + "expectedSummary": "HPA is holding at minReplicas=2", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + "metadata": { "name": "idle", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "idle" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 2, + "desiredReplicas": 2, + "conditions": [ + { "type": "ScalingLimited", "status": "True", "reason": "TooFewReplicas", "message": "the desired replica count is less than the minimum replica count" } + ] + } + } + }, + { + "name": "scale down stabilized", + "expectedState": "stabilized", + "expectedReasons": ["scale_down_stabilized"], + "expectedSummary": "HPA is holding replicas during scale-down stabilization", + "expectedTableState": "ok", + "hpa": { + "apiVersion": "autoscaling/v2", + "kind": "HorizontalPodAutoscaler", + 
"metadata": { "name": "web", "namespace": "default", "generation": 1 }, + "spec": { + "scaleTargetRef": { "apiVersion": "apps/v1", "kind": "Deployment", "name": "web" }, + "minReplicas": 2, + "maxReplicas": 10 + }, + "status": { + "observedGeneration": 1, + "currentReplicas": 5, + "desiredReplicas": 5, + "conditions": [ + { "type": "ScalingLimited", "status": "True", "reason": "ScaleDownStabilized", "message": "recent recommendations were higher than current one" } + ] + } + } + } +] diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 8317c2422..733a68c25 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -928,6 +928,7 @@ export function useResource(kind: string, namespace: string, name: string, gr data: query.data?.resource, relationships: query.data?.relationships, certificateInfo: query.data?.certificateInfo, + hpaDiagnosis: query.data?.hpaDiagnosis, } } diff --git a/web/src/components/resources/renderers/HPARenderer.tsx b/web/src/components/resources/renderers/HPARenderer.tsx index 8952efbdd..b8c42eafc 100644 --- a/web/src/components/resources/renderers/HPARenderer.tsx +++ b/web/src/components/resources/renderers/HPARenderer.tsx @@ -1,16 +1,19 @@ import { HPARenderer as BaseHPARenderer } from '@skyhook-io/k8s-ui/components/resources/renderers/HPARenderer' import { HPACharts } from '../../resource/HPACharts' +import type { HPADiagnosis } from '@skyhook-io/k8s-ui' interface HPARendererProps { data: any onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void + hpaDiagnosis?: HPADiagnosis } -export function HPARenderer({ data, onNavigate }: HPARendererProps) { +export function HPARenderer({ data, onNavigate, hpaDiagnosis }: HPARendererProps) { return ( } /> ) diff --git a/web/src/components/resources/renderers/WorkloadRenderer.tsx b/web/src/components/resources/renderers/WorkloadRenderer.tsx index 72226b52a..c914f2e17 100644 --- a/web/src/components/resources/renderers/WorkloadRenderer.tsx +++ 
b/web/src/components/resources/renderers/WorkloadRenderer.tsx @@ -1,9 +1,11 @@ import { WorkloadRenderer as BaseWorkloadRenderer } from '@skyhook-io/k8s-ui/components/resources/renderers/WorkloadRenderer' import { useNavigate } from 'react-router-dom' -import { useScaleWorkload } from '../../../api/client' +import { useScaleWorkload, fetchJSON } from '../../../api/client' import { useRBACSubject } from '../../../api/rbac' -import { useQueryClient } from '@tanstack/react-query' -import type { Relationships, ResourceRef } from '../../../types' +import { useQueries, useQueryClient } from '@tanstack/react-query' +import { kindToPlural } from '@skyhook-io/k8s-ui/utils/navigation' +import type { Relationships, ResourceRef, ResourceWithRelationships } from '../../../types' +import type { ScalerDiagnosis } from '@skyhook-io/k8s-ui/components/resources/renderers/WorkloadRenderer' // Map plural lowercase kind to singular PascalCase for ownerReferences matching function getOwnerKind(kind: string): string { @@ -40,6 +42,34 @@ export function WorkloadRenderer({ kind, data, onNavigate, scaleBlockedBy }: Wor const { data: rbacData, isLoading: rbacLoading, error: rbacError } = useRBACSubject( 'ServiceAccount', namespace, saName, !!namespace, ) + const hpaRefs = (scaleBlockedBy ?? []).filter(ref => { + const refKind = ref.kind.toLowerCase() + return refKind === 'horizontalpodautoscaler' || refKind === 'hpa' + }) + const hpaQueries = useQueries({ + queries: hpaRefs.map(ref => ({ + queryKey: ['resource', kindToPlural(ref.kind), ref.namespace, ref.name, ref.group], + queryFn: () => { + const ns = ref.namespace || '_' + const params = new URLSearchParams() + if (ref.group) params.set('group', ref.group) + const query = params.toString() + return fetchJSON>(`/resources/${kindToPlural(ref.kind)}/${ns}/${ref.name}${query ? 
`?${query}` : ''}`) + }, + enabled: Boolean(ref.kind && ref.name), + staleTime: 10000, + retry: false, + })), + }) + const scalerDiagnostics: ScalerDiagnosis[] = hpaRefs.map((ref, index) => { + const query = hpaQueries[index] + return { + ref, + diagnosis: query.data?.hpaDiagnosis, + loading: query.isLoading, + error: query.isError ? (query.error instanceof Error ? query.error.message : 'Failed to fetch HPA') : undefined, + } + }) return ( { await scaleMutation.mutateAsync({ kind, diff --git a/web/src/components/workload/WorkloadView.tsx b/web/src/components/workload/WorkloadView.tsx index fdd824e23..4effd59c5 100644 --- a/web/src/components/workload/WorkloadView.tsx +++ b/web/src/components/workload/WorkloadView.tsx @@ -270,6 +270,7 @@ export function WorkloadView({ const resource = resourceResponse?.resource const relationships = resourceResponse?.relationships const certificateInfo = resourceResponse?.certificateInfo + const hpaDiagnosis = resourceResponse?.hpaDiagnosis const relationshipGitopsOwner = useMemo(() => gitOpsOwnerFromRelationships(relationships), [relationships]) const inheritedGitOpsLookupRef = useMemo( () => findInheritedGitOpsLookupRef(relationships, relationshipGitopsOwner, { kind: kindProp, namespace, name, group: rest.group }), @@ -478,6 +479,7 @@ export function WorkloadView({ resource={resource} relationships={relationships} certificateInfo={certificateInfo} + hpaDiagnosis={hpaDiagnosis} isLoading={resourceLoading} resourceError={resourceError} refetch={refetchResource}