Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions internal/k8s/detect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ func TestDetectProblems_PopulatesGroup(t *testing.T) {
NumberUnavailable: 2,
},
},
// HPA at its replica ceiling — DetectHPAProblems flags
// "maxed" when current and desired both hit MaxReplicas.
// HPA capped by maxReplicas — DetectHPAProblems flags
// "maxed" when the controller reports TooManyReplicas.
// The wrapper sets Group="autoscaling".
&autoscalingv2.HorizontalPodAutoscaler{
ObjectMeta: metav1.ObjectMeta{Name: "api", Namespace: "prod"},
Expand All @@ -78,6 +78,9 @@ func TestDetectProblems_PopulatesGroup(t *testing.T) {
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 10,
DesiredReplicas: 10,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"},
},
},
},
// Job stuck Active>0 for >1h with no completions.
Expand Down
80 changes: 53 additions & 27 deletions internal/k8s/detect_workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ import (

autoscalingv2 "k8s.io/api/autoscaling/v2"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"

"github.com/skyhook-io/radar/pkg/hpadiag"
)

// HPAProblem describes a detected issue with an HPA.
type HPAProblem struct {
Name string
Namespace string
Problem string // "maxed"
Problem string // "maxed" or "cannot-scale"
Reason string
}

Expand All @@ -29,42 +30,67 @@ type HPAProblem struct {
func DetectHPAProblems(hpas []*autoscalingv2.HorizontalPodAutoscaler) []HPAProblem {
var problems []HPAProblem
for _, hpa := range hpas {
// "maxed" — at replica ceiling and wanting more.
if hpa.Spec.MaxReplicas > 0 && hpa.Status.CurrentReplicas >= hpa.Spec.MaxReplicas && hpa.Status.DesiredReplicas >= hpa.Spec.MaxReplicas {
diagnosis := hpadiag.Analyze(hpa)
if diagnosis == nil {
continue
}

if reason, ok := firstHPAReason(diagnosis, hpadiag.ReasonLimitedMax); ok {
problems = append(problems, HPAProblem{
Name: hpa.Name,
Namespace: hpa.Namespace,
Problem: "maxed",
Reason: fmt.Sprintf("%d/%d replicas (wants %d)", hpa.Status.CurrentReplicas, hpa.Spec.MaxReplicas, hpa.Status.DesiredReplicas),
Reason: maxedReasonText(diagnosis, reason),
})
}
// "cannot scale" — the autoscaler controller reports it can't get
// metrics or scale calls are failing. Emitted as a separate problem
// so the maxed-check above isn't masked by an unrelated metrics
// outage on the same HPA.
for _, cond := range hpa.Status.Conditions {
if cond.Type == autoscalingv2.ScalingActive && cond.Status == corev1.ConditionFalse {
reason := cond.Reason
if reason == "" {
reason = "ScalingActive=False"
}
msg := cond.Message
if msg == "" {
msg = "HPA controller reports it cannot scale this workload"
}
problems = append(problems, HPAProblem{
Name: hpa.Name,
Namespace: hpa.Namespace,
Problem: "cannot-scale",
Reason: fmt.Sprintf("%s: %s", reason, msg),
})
break
}

if reason, ok := firstHPAReason(diagnosis, hpadiag.ReasonUnableToScale, hpadiag.ReasonMetricsUnavailable); ok {
problems = append(problems, HPAProblem{
Name: hpa.Name,
Namespace: hpa.Namespace,
Problem: "cannot-scale",
Reason: reasonText(reason),
})
Comment thread
cursor[bot] marked this conversation as resolved.
}
}
return problems
}

// firstHPAReason looks up a reason in diagnosis.Reasons by ID.
//
// The ids are tried in the order given, so an earlier id takes priority over
// a later one even when the later id's reason appears first in the slice.
// It returns the matching reason and true, or the zero Reason and false when
// none of the ids is present.
func firstHPAReason(diagnosis *hpadiag.Diagnosis, ids ...hpadiag.ReasonID) (hpadiag.Reason, bool) {
	for _, wanted := range ids {
		for i := range diagnosis.Reasons {
			if candidate := diagnosis.Reasons[i]; candidate.ID == wanted {
				return candidate, true
			}
		}
	}
	return hpadiag.Reason{}, false
}

// reasonText renders a diagnosis reason as a single human-readable string.
//
// The result is "<ConditionReason>: <Message>" when both are set, the bare
// Message when only the message is set, and the reason ID as a string when
// there is no message at all.
func reasonText(reason hpadiag.Reason) string {
	switch {
	case reason.Message == "":
		// Without a message the ID is the only thing left to show.
		return string(reason.ID)
	case reason.ConditionReason == "":
		return reason.Message
	default:
		return reason.ConditionReason + ": " + reason.Message
	}
}

// maxedReasonText builds the Reason string for a "maxed" HPA problem.
//
// When the diagnosis is nil or carries no positive Bounds.Max, it falls back
// to reasonText(reason). Otherwise it produces "<current>/<max> replicas",
// appends " (wants <desired>)" when Bounds.Desired is positive, and finally
// appends ": <detail>" when reasonText yields a non-empty detail.
func maxedReasonText(diagnosis *hpadiag.Diagnosis, reason hpadiag.Reason) string {
	detail := reasonText(reason)
	if diagnosis == nil || diagnosis.Bounds.Max <= 0 {
		return detail
	}

	summary := fmt.Sprintf("%d/%d replicas", diagnosis.Bounds.Current, diagnosis.Bounds.Max)
	if diagnosis.Bounds.Desired > 0 {
		summary = fmt.Sprintf("%s (wants %d)", summary, diagnosis.Bounds.Desired)
	}

	if detail == "" {
		return summary
	}
	return summary + ": " + detail
}

// CronJobProblem describes a detected issue with a CronJob.
type CronJobProblem struct {
Name string
Expand Down
144 changes: 140 additions & 4 deletions internal/k8s/detect_workload_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package k8s

import (
"strings"
"testing"
"time"

autoscalingv2 "k8s.io/api/autoscaling/v2"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"testing"
"time"
)

func TestDetectHPAProblems(t *testing.T) {
Expand All @@ -14,18 +17,37 @@ func TestDetectHPAProblems(t *testing.T) {
hpas []*autoscalingv2.HorizontalPodAutoscaler
wantCount int
wantProblem string
wantReason string
}{
{
name: "maxed HPA",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{CurrentReplicas: 10, DesiredReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 10,
DesiredReplicas: 10,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"},
},
},
},
},
wantCount: 1,
wantProblem: "maxed",
wantReason: "10/10 replicas (wants 10): TooManyReplicas: the desired replica count is more than the maximum replica count",
},
{
name: "at max without controller limit condition is not maxed",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{CurrentReplicas: 10, DesiredReplicas: 10},
},
},
wantCount: 0,
},
{
name: "not maxed",
Expand Down Expand Up @@ -60,6 +82,113 @@ func TestDetectHPAProblems(t *testing.T) {
},
wantCount: 0,
},
{
name: "metrics unavailable",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 5,
DesiredReplicas: 5,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "FailedGetResourceMetric", Message: "missing cpu request"},
},
},
},
},
wantCount: 1,
wantProblem: "cannot-scale",
},
{
name: "maxed and metrics unavailable emit two distinct issues",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 10,
DesiredReplicas: 10,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "FailedGetResourceMetric", Message: "missing cpu request"},
{Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooManyReplicas", Message: "the desired replica count is more than the maximum replica count"},
},
},
},
},
wantCount: 2,
},
{
name: "scaling disabled is not a metrics outage",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "paused", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 0,
DesiredReplicas: 0,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingActive, Status: corev1.ConditionFalse, Reason: "ScalingDisabled", Message: "scaling is disabled since the replica count of the target is zero"},
},
},
},
},
wantCount: 0,
},
{
name: "pinned min equals max is not maxed",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "fixed", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{
MinReplicas: ptrInt32(5),
MaxReplicas: 5,
},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 5,
DesiredReplicas: 5,
},
},
},
wantCount: 0,
},
{
name: "min limited is drawer context only",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "idle", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{
MinReplicas: ptrInt32(2),
MaxReplicas: 10,
},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 2,
DesiredReplicas: 2,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "TooFewReplicas", Message: "the desired replica count is less than the minimum replica count"},
},
},
},
},
wantCount: 0,
},
{
name: "scale down stabilization is drawer context only",
hpas: []*autoscalingv2.HorizontalPodAutoscaler{
{
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default"},
Spec: autoscalingv2.HorizontalPodAutoscalerSpec{MaxReplicas: 10},
Status: autoscalingv2.HorizontalPodAutoscalerStatus{
CurrentReplicas: 5,
DesiredReplicas: 5,
Conditions: []autoscalingv2.HorizontalPodAutoscalerCondition{
{Type: autoscalingv2.ScalingLimited, Status: corev1.ConditionTrue, Reason: "ScaleDownStabilized"},
},
},
},
},
wantCount: 0,
},
}

for _, tt := range tests {
Expand All @@ -68,15 +197,22 @@ func TestDetectHPAProblems(t *testing.T) {
if len(problems) != tt.wantCount {
t.Errorf("DetectHPAProblems() returned %d problems, want %d", len(problems), tt.wantCount)
}
if tt.wantCount > 0 && len(problems) > 0 {
if tt.wantProblem != "" && len(problems) > 0 {
if problems[0].Problem != tt.wantProblem {
t.Errorf("problem = %q, want %q", problems[0].Problem, tt.wantProblem)
}
}
if tt.wantReason != "" && len(problems) > 0 && !strings.Contains(problems[0].Reason, tt.wantReason) {
t.Errorf("reason = %q, want to contain %q", problems[0].Reason, tt.wantReason)
}
})
}
}

// ptrInt32 returns a pointer to a freshly allocated int32 holding v, for
// filling optional *int32 fields in test fixtures.
func ptrInt32(v int32) *int32 {
	p := new(int32)
	*p = v
	return p
}

func TestDetectCronJobProblems(t *testing.T) {
now := time.Now()
suspended := true
Expand Down
12 changes: 12 additions & 0 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"github.com/go-chi/cors"
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -42,6 +43,7 @@ import (
"github.com/skyhook-io/radar/internal/timeline"
"github.com/skyhook-io/radar/internal/updater"
"github.com/skyhook-io/radar/internal/version"
"github.com/skyhook-io/radar/pkg/hpadiag"
"github.com/skyhook-io/radar/pkg/perfstats"
"github.com/skyhook-io/radar/pkg/rbac"
topology "github.com/skyhook-io/radar/pkg/topology"
Expand Down Expand Up @@ -1587,6 +1589,14 @@ func setTypeMeta(resource any) {
k8s.SetTypeMeta(resource)
}

// hpaDiagnosisFor returns the hpadiag analysis for resource when it is a
// *autoscalingv2.HorizontalPodAutoscaler, and nil for any other type.
func hpaDiagnosisFor(resource any) *hpadiag.Diagnosis {
	if hpa, isHPA := resource.(*autoscalingv2.HorizontalPodAutoscaler); isHPA {
		return hpadiag.Analyze(hpa)
	}
	return nil
}

// preflightResourceGet runs the per-user RBAC gates that must pass before any
// single-resource GET fetch. Mirrors the kind/scope-aware logic used by both
// the REST handler (handleGetResource) and the AI handler (handleAIGetResource)
Expand Down Expand Up @@ -1726,6 +1736,7 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) {
s.writeJSON(w, topology.ResourceWithRelationships{
Resource: resource,
Relationships: relationships,
HPADiagnosis: hpaDiagnosisFor(resource),
})
return
}
Expand Down Expand Up @@ -1941,6 +1952,7 @@ func (s *Server) handleGetResource(w http.ResponseWriter, r *http.Request) {
response := topology.ResourceWithRelationships{
Resource: resource,
Relationships: relationships,
HPADiagnosis: hpaDiagnosisFor(resource),
}

// Enrich TLS secrets with parsed certificate info
Expand Down
Loading
Loading