// AuthGate is the per-request authorization callback consulted by handlers
// that serve K8s spec data out of the shared informer cache. That cache is
// filled with Radar's service-account permissions, so handlers must ask the
// gate whether the *calling user* may read the resource before answering.
// The concrete implementation lives in the server package and is injected via
// SetAuthGate to avoid an import cycle (server imports prometheus).
//
// An implementation returns true when auth is disabled or the user may read
// the resource, and false when the request must be refused with 403.
type AuthGate func(r *http.Request, group, resource, namespace, verb string) bool

// authGate holds the currently installed gate. A nil pointer means "no gate".
var authGate atomic.Pointer[AuthGate]

// SetAuthGate installs the request-scoped authorization check. Passing nil
// removes any installed gate (only appropriate for tests).
func SetAuthGate(fn AuthGate) {
	var next *AuthGate
	if fn != nil {
		// fn is a by-value parameter, so taking its address yields a pointer
		// private to this call.
		next = &fn
	}
	authGate.Store(next)
}

// canRead asks the installed AuthGate whether the request may perform verb on
// group/resource in namespace. With no gate installed it returns true, so the
// gate is strictly additive and the no-auth path is never locked out.
func canRead(r *http.Request, group, resource, namespace, verb string) bool {
	if gate := authGate.Load(); gate != nil {
		return (*gate)(r, group, resource, namespace, verb)
	}
	return true
}
@@ -52,6 +55,8 @@ func CategoryLabel(cat MetricCategory) string { return "Network Transmitted" case CategoryFilesystem: return "Filesystem" + case CategoryRestarts: + return "Restarts" default: return string(cat) } @@ -70,6 +75,8 @@ func CategoryUnit(cat MetricCategory) string { return "bytes/s" case CategoryFilesystem: return "bytes/s" + case CategoryRestarts: + return "count" default: return "" } @@ -97,6 +104,8 @@ func SupportedKinds() []string { func CategoriesForKind(kind string) []MetricCategory { switch strings.ToLower(kind) { case "node": + // Nodes have neither workload restart semantics nor the network/filesystem + // container metrics — node-exporter covers them separately on the Node page. return []MetricCategory{CategoryCPU, CategoryMemory, CategoryFilesystem} default: return AllCategories() @@ -205,6 +214,13 @@ func buildPodQuery(namespace, podName string, category MetricCategory, filterCon } switch category { + case CategoryRestarts: + // changes() over a 1h window gives the count of restarts during that window; + // using a long window keeps the chart legible (most pods never restart). + // Sums across containers so a multi-container pod surfaces one line per pod. 
+ return fmt.Sprintf( + `sum by (pod,namespace) (changes(kube_pod_container_status_restarts_total{namespace='%s',pod='%s'}[1h]))`, + ns, pod) case CategoryCPU: return fmt.Sprintf( `sum(rate(container_cpu_usage_seconds_total{%snamespace='%s',pod='%s'}[5m])) by (pod,namespace)`, @@ -240,6 +256,10 @@ func buildWorkloadQuery(namespace, workloadName string, category MetricCategory, } switch category { + case CategoryRestarts: + return fmt.Sprintf( + `sum by (pod,namespace) (changes(kube_pod_container_status_restarts_total{namespace='%s',pod=~'%s'}[1h]))`, + ns, podPattern) case CategoryCPU: return fmt.Sprintf( `sum(rate(container_cpu_usage_seconds_total{%snamespace='%s',pod=~'%s'}[5m])) by (pod,namespace)`, diff --git a/internal/prometheus/rightsizing.go b/internal/prometheus/rightsizing.go new file mode 100644 index 000000000..9be1b08d1 --- /dev/null +++ b/internal/prometheus/rightsizing.go @@ -0,0 +1,546 @@ +package prometheus + +import ( + "context" + "errors" + "fmt" + "net/http" + "strings" + "time" + + "github.com/go-chi/chi/v5" + "github.com/skyhook-io/radar/internal/errorlog" + "github.com/skyhook-io/radar/internal/k8s" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +// Sentinel errors distinguish the cache-loading failure modes so handlers can +// map them to the right HTTP status. A user without `list deployments` would +// otherwise see "404 not found" for a workload they simply can't read, and +// conclude Radar is broken. +var ( + errCacheNotReady = errors.New("resource cache not initialized") + errKindRBACDenied = errors.New("kind not listable by service account") + errWorkloadMissing = errors.New("workload not found") +) + +// Tone is the recommendation severity used by the rightsizing UI. +// Deliberately mild vocabulary: most over/under-provisioning isn't a "problem," +// it's a tuning opportunity. We reserve "alert" for actual throttling/OOM risk. 
type Tone string

// Tone values, ordered from least to most severe. The classifier in this file
// assigns exactly one of them to every rightsizing row.
const (
	ToneOK       Tone = "ok"       // Well-sized (or within tolerated headroom); no action needed
	ToneInfo     Tone = "info"     // Over-provisioned beyond the per-resource threshold; optional tightening
	ToneWarning  Tone = "warning"  // Worth reviewing (P95 above the request, or no request set)
	ToneAlert    Tone = "alert"    // CPU P95 above the CPU limit; throttling likely
	ToneCritical Tone = "critical" // Memory P95 at or above 95% of the memory limit; OOM risk
)

// RightsizingRow is one row of the rightsizing recommendation: a container × resource.
// Pointer fields are omitted from the JSON when nil.
type RightsizingRow struct {
	Container      string  `json:"container"`
	Resource       string  `json:"resource"` // "cpu" | "memory"
	CurrentRequest *string `json:"currentRequest,omitempty"`
	CurrentLimit   *string `json:"currentLimit,omitempty"`
	P95            *string `json:"p95,omitempty"`
	RecommendedReq *string `json:"recommendedRequest,omitempty"`
	Tone           Tone    `json:"tone"`
	Message        string  `json:"message"`
}

// RightsizingResponse is the rightsizing endpoint response.
type RightsizingResponse struct {
	Kind            string           `json:"kind"`
	Namespace       string           `json:"namespace"`
	Name            string           `json:"name"`
	Window          string           `json:"window"` // e.g. "24h"
	SampleAvailable bool             `json:"sampleAvailable"`
	Rows            []RightsizingRow `json:"rows"`
	Reason          string           `json:"reason,omitempty"` // populated when sampleAvailable=false
}

// Tunables for the recommendation math: the lookback window used for the P95
// queries and the multipliers applied to P95 to derive a recommended request.
const (
	rightsizingWindow         = 24 * time.Hour
	rightsizingHeadroomCPU    = 1.15 // 15% headroom above P95
	rightsizingHeadroomMemory = 1.10 // memory P95 is already conservative
)

// handleRightsizing returns rightsizing recommendations for a workload's containers.
// Only Deployment / StatefulSet / DaemonSet supported — per-pod rightsizing
// is wrong granularity (recs are per-container-template).
+func handleRightsizing(w http.ResponseWriter, r *http.Request) { + client := GetClient() + if client == nil { + writeError(w, http.StatusServiceUnavailable, "Prometheus client not initialized") + return + } + + kind := chi.URLParam(r, "kind") + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + + if !isRightsizingKind(kind) { + writeError(w, http.StatusBadRequest, "rightsizing only supported for Deployment, StatefulSet, DaemonSet") + return + } + + // Per-user RBAC: the cache is populated under Radar's SA, so without this + // gate any authenticated user could fetch any namespace's container spec + // + P95 by guessing names. Use "get" — matches normal resource-detail reads. + resource := strings.ToLower(kind) + "s" + if !canRead(r, "apps", resource, namespace, "get") { + writeError(w, http.StatusForbidden, "forbidden") + return + } + + containers, err := loadWorkloadContainers(kind, namespace, name) + if err != nil { + switch { + case errors.Is(err, errCacheNotReady): + writeError(w, http.StatusServiceUnavailable, err.Error()) + case errors.Is(err, errKindRBACDenied): + writeError(w, http.StatusForbidden, err.Error()) + case errors.Is(err, errWorkloadMissing): + writeError(w, http.StatusNotFound, err.Error()) + default: + writeError(w, http.StatusInternalServerError, err.Error()) + } + return + } + if len(containers) == 0 { + writeJSON(w, http.StatusOK, RightsizingResponse{ + Kind: kind, Namespace: namespace, Name: name, + Window: "24h", SampleAvailable: false, + Rows: []RightsizingRow{}, + Reason: "Workload has no runtime containers (init-only or empty spec).", + }) + return + } + + resp := RightsizingResponse{ + Kind: kind, Namespace: namespace, Name: name, + Window: "24h", + SampleAvailable: true, + Rows: make([]RightsizingRow, 0, len(containers)*2), + } + + anyData := false + for _, c := range containers { + cpuRow := computeRightsizingRow(r.Context(), client, namespace, name, c, "cpu") + memRow := computeRightsizingRow(r.Context(), 
client, namespace, name, c, "memory") + if cpuRow.P95 != nil || memRow.P95 != nil { + anyData = true + } + resp.Rows = append(resp.Rows, cpuRow, memRow) + } + + if !anyData { + resp.SampleAvailable = false + resp.Reason = "No usage samples in the last 24h — workload may be too new, or Prometheus retention is short." + } + + writeJSON(w, http.StatusOK, resp) +} + +func isRightsizingKind(kind string) bool { + switch strings.ToLower(kind) { + case "deployment", "statefulset", "daemonset": + return true + } + return false +} + +type containerSpec struct { + name string + cpuReq *resource.Quantity + cpuLim *resource.Quantity + memReq *resource.Quantity + memLim *resource.Quantity +} + +// loadWorkloadContainers reads runtime container specs (excluding pure init, +// including native sidecars) from the K8s cache. Returns sentinel errors so +// the handler can map cache-not-ready to 503, RBAC-denied to 403, and only +// genuine misses to 404. +func loadWorkloadContainers(kind, namespace, name string) ([]containerSpec, error) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, errCacheNotReady + } + + var podTemplate *corev1.PodSpec + switch strings.ToLower(kind) { + case "deployment": + if cache.Deployments() == nil { + return nil, fmt.Errorf("%w: deployments", errKindRBACDenied) + } + d, err := cache.Deployments().Deployments(namespace).Get(name) + if err != nil { + return nil, fmt.Errorf("%w: deployment %s/%s", errWorkloadMissing, namespace, name) + } + podTemplate = &d.Spec.Template.Spec + case "statefulset": + if cache.StatefulSets() == nil { + return nil, fmt.Errorf("%w: statefulsets", errKindRBACDenied) + } + ss, err := cache.StatefulSets().StatefulSets(namespace).Get(name) + if err != nil { + return nil, fmt.Errorf("%w: statefulset %s/%s", errWorkloadMissing, namespace, name) + } + podTemplate = &ss.Spec.Template.Spec + case "daemonset": + if cache.DaemonSets() == nil { + return nil, fmt.Errorf("%w: daemonsets", errKindRBACDenied) + } + ds, err := 
cache.DaemonSets().DaemonSets(namespace).Get(name) + if err != nil { + return nil, fmt.Errorf("%w: daemonset %s/%s", errWorkloadMissing, namespace, name) + } + podTemplate = &ds.Spec.Template.Spec + } + + if podTemplate == nil { + return nil, errCacheNotReady + } + + return extractRuntimeContainers(podTemplate), nil +} + +// extractRuntimeContainers returns containers + native-sidecar init containers +// (initContainers with restartPolicy=Always, GA in 1.33). Native sidecars run +// for the pod's lifetime and must be included alongside regular containers; +// pure init containers run to completion and are excluded. +func extractRuntimeContainers(podSpec *corev1.PodSpec) []containerSpec { + containers := make([]containerSpec, 0, len(podSpec.Containers)) + for _, c := range podSpec.Containers { + containers = append(containers, extractContainerSpec(c)) + } + for _, c := range podSpec.InitContainers { + if c.RestartPolicy != nil && *c.RestartPolicy == corev1.ContainerRestartPolicyAlways { + containers = append(containers, extractContainerSpec(c)) + } + } + return containers +} + +func extractContainerSpec(c corev1.Container) containerSpec { + out := containerSpec{name: c.Name} + if q, ok := c.Resources.Requests[corev1.ResourceCPU]; ok { + qc := q.DeepCopy() + out.cpuReq = &qc + } + if q, ok := c.Resources.Limits[corev1.ResourceCPU]; ok { + qc := q.DeepCopy() + out.cpuLim = &qc + } + if q, ok := c.Resources.Requests[corev1.ResourceMemory]; ok { + qc := q.DeepCopy() + out.memReq = &qc + } + if q, ok := c.Resources.Limits[corev1.ResourceMemory]; ok { + qc := q.DeepCopy() + out.memLim = &qc + } + return out +} + +func computeRightsizingRow(ctx context.Context, client *Client, namespace, workload string, c containerSpec, resKind string) RightsizingRow { + row := RightsizingRow{ + Container: c.name, + Resource: resKind, + Tone: ToneOK, + Message: "", + } + + var req, lim *resource.Quantity + switch resKind { + case "cpu": + req, lim = c.cpuReq, c.cpuLim + case "memory": + 
req, lim = c.memReq, c.memLim + } + if req != nil { + s := req.String() + row.CurrentRequest = &s + } + if lim != nil { + s := lim.String() + row.CurrentLimit = &s + } + + p95, err := queryContainerP95(ctx, client, namespace, workload, c.name, resKind) + if err != nil || p95 == nil { + // No data — return row with what we know from spec but no recommendation. + // Skip the row entirely from the UI's perspective by leaving P95 nil; the + // frontend can still display current requests if it wants. + return row + } + + // Format P95 for display. + p95Str := formatRightsizingValue(*p95, resKind) + row.P95 = &p95Str + + classifyRightsizing(&row, *p95, req, lim, resKind) + return row +} + +// queryContainerP95 returns the P95 of a container's CPU/memory usage over the +// rightsizing window. Returns nil (no error) when there's no data. +func queryContainerP95(ctx context.Context, client *Client, namespace, workload, container, resKind string) (*float64, error) { + ns := SanitizeLabelValue(namespace) + podPattern := fmt.Sprintf("%s-.*", escapeRegexMeta(SanitizeLabelValue(workload))) + cn := SanitizeLabelValue(container) + windowSec := int64(rightsizingWindow.Seconds()) + + var query string + switch resKind { + case "cpu": + // P95 over 24h of 5min rates, max across pods (worst-case for sizing). + query = fmt.Sprintf( + `quantile_over_time(0.95, max by (container) (rate(container_cpu_usage_seconds_total{namespace='%s',pod=~'%s',container='%s'}[5m]))[%ds:5m])`, + ns, podPattern, cn, windowSec) + case "memory": + // Memory is a gauge — straight P95 of working set, max across pods. 
+ query = fmt.Sprintf( + `quantile_over_time(0.95, max by (container) (container_memory_working_set_bytes{namespace='%s',pod=~'%s',container='%s'})[%ds:])`, + ns, podPattern, cn, windowSec) + default: + return nil, fmt.Errorf("unsupported resource: %s", resKind) + } + + res, err := client.Query(ctx, query) + if err != nil { + errorlog.Record("prometheus", "warning", "rightsizing P95 query failed for %s/%s/%s/%s: %v", namespace, workload, container, resKind, err) + return nil, err + } + if len(res.Series) == 0 || len(res.Series[0].DataPoints) == 0 { + return nil, nil + } + v := res.Series[0].DataPoints[0].Value + // Prom returns NaN as a float; treat as no data. + if v != v { + return nil, nil + } + return &v, nil +} + +// classifyRightsizing applies the tone + message + recommended request based +// on P95 vs current request/limit. Deliberately mild — most workloads are +// over-provisioned by 2-3x and we don't want to nag them about it. +func classifyRightsizing(row *RightsizingRow, p95 float64, req, lim *resource.Quantity, resKind string) { + // Hard rule: memory P95 ≥ limit is an active OOM risk regardless of headroom math. + if resKind == "memory" && lim != nil { + limVal := quantityToFloat(*lim, resKind) + if limVal > 0 && p95 >= limVal*0.95 { + row.Tone = ToneCritical + row.Message = "P95 near memory limit — active OOM risk" + if rec := recommendRequest(p95, resKind); rec != "" { + row.RecommendedReq = &rec + } + return + } + } + + // No request set — informational nudge, not an alarm. + if req == nil { + row.Tone = ToneWarning + row.Message = fmt.Sprintf("No %s request set — consider setting one based on observed usage", resKind) + if rec := recommendRequest(p95, resKind); rec != "" { + row.RecommendedReq = &rec + } + return + } + + reqVal := quantityToFloat(*req, resKind) + if reqVal <= 0 { + return + } + + // CPU-specific: P95 exceeds limit = active throttling. 
+ if resKind == "cpu" && lim != nil { + limVal := quantityToFloat(*lim, resKind) + if limVal > 0 && p95 > limVal { + row.Tone = ToneAlert + row.Message = "P95 exceeds CPU limit — throttling likely" + if rec := recommendRequest(p95, resKind); rec != "" { + row.RecommendedReq = &rec + } + return + } + } + + ratio := reqVal / p95 + + // P95 exceeds request (but within limit) → throttled occasionally / no burst headroom. + if ratio < 1.0 { + row.Tone = ToneWarning + row.Message = fmt.Sprintf("P95 usage exceeds request (%.0f%% over)", (1.0/ratio-1.0)*100.0) + if rec := recommendRequest(p95, resKind); rec != "" { + row.RecommendedReq = &rec + } + return + } + + // Sensible headroom (1x-3x) — well-sized. No nag. + if ratio <= 3.0 { + row.Tone = ToneOK + row.Message = "Well-sized" + return + } + + // Significant over-provisioning thresholds chosen to avoid nagging the common + // "I requested 256Mi and use 100Mi" pattern (~2.5x — that's fine). + // CPU is bursty so we tolerate more headroom there than memory. + overThreshold := 5.0 + if resKind == "cpu" { + overThreshold = 8.0 + } + + if ratio > overThreshold { + row.Tone = ToneInfo + row.Message = fmt.Sprintf("Over-provisioned by %.1fx — could reduce", ratio) + if rec := recommendRequest(p95, resKind); rec != "" { + row.RecommendedReq = &rec + } + return + } + + // Between 3x and threshold — informational only, no recommendation. + row.Tone = ToneOK + row.Message = fmt.Sprintf("%.1fx headroom", ratio) +} + +func recommendRequest(p95 float64, resKind string) string { + headroom := rightsizingHeadroomCPU + if resKind == "memory" { + headroom = rightsizingHeadroomMemory + } + return formatRightsizingValue(p95*headroom, resKind) +} + +// quantityToFloat converts a K8s Quantity to a float in the same units as +// Prom values (CPU = cores, memory = bytes). 
// formatRightsizingValue formats a Prom-shaped value (cores for "cpu", bytes
// for "memory") into the human-friendly form that maps back to
// spec.resources strings. Any other resKind yields "".
//
// CPU: values below one millicore render as "1m"; otherwise the value is
// snapped to a 10m step (minimum "10m"). One core and above is printed in
// cores with one decimal, and a trailing ".0" is always dropped ("2", "1.5").
//
// Memory: one GiB and above is printed as "<x.y>Gi"; below that the value is
// rounded UP to the next multiple of 16Mi (minimum "16Mi"), so the result is
// never smaller than the input.
func formatRightsizingValue(v float64, resKind string) string {
	switch resKind {
	case "cpu":
		if v < 0.001 {
			return "1m"
		}
		// Snap to a 10m step to avoid noisy recommendations like 137m.
		millis := max(int64(v*1000.0+5)/10*10, 10)
		if millis < 1000 {
			return fmt.Sprintf("%dm", millis)
		}
		// Trim after formatting: a value such as 1980m prints as "2.0" under
		// %.1f even though it is not a whole number of cores beforehand.
		return strings.TrimSuffix(fmt.Sprintf("%.1f", float64(millis)/1000.0), ".0")
	case "memory":
		const Mi = 1024 * 1024
		const Gi = 1024 * Mi
		if v >= float64(Gi) {
			return fmt.Sprintf("%.1fGi", v/float64(Gi))
		}
		// Round to whole bytes first so float noise from the headroom
		// multiplication cannot push an exact value into the next step, then
		// take the integer ceiling to MiB and to the 16Mi step.
		bytes := int64(v + 0.5)
		mib := (bytes + Mi - 1) / Mi
		mib = max((mib+15)/16*16, 16)
		return fmt.Sprintf("%dMi", mib)
	}
	return ""
}
+func handlePVCUsage(w http.ResponseWriter, r *http.Request) { + client := GetClient() + if client == nil { + writeError(w, http.StatusServiceUnavailable, "Prometheus client not initialized") + return + } + + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + + if !canRead(r, "", "persistentvolumeclaims", namespace, "get") { + writeError(w, http.StatusForbidden, "forbidden") + return + } + + ns := SanitizeLabelValue(namespace) + pvc := SanitizeLabelValue(name) + + // kubelet's native label is `persistentvolumeclaim`; clusters with custom + // relabeling that renamed it will return no series and the gauge hides. + usedQuery := fmt.Sprintf(`max(kubelet_volume_stats_used_bytes{namespace='%s',persistentvolumeclaim='%s'})`, ns, pvc) + capQuery := fmt.Sprintf(`max(kubelet_volume_stats_capacity_bytes{namespace='%s',persistentvolumeclaim='%s'})`, ns, pvc) + + resp := PVCUsageResponse{Namespace: namespace, Name: name} + + usedRes, err := client.Query(r.Context(), usedQuery) + if err != nil { + // Distinguish "Prometheus is unreachable" from "CSI doesn't report" so + // operators can find this in the errorlog stream when the gauge mysteriously + // disappears. The frontend still hides on hasData=false. 
+ errorlog.Record("prometheus", "warning", "pvc used-bytes query failed for %s/%s: %v", namespace, name, err) + writeJSON(w, http.StatusOK, resp) + return + } + capRes, err := client.Query(r.Context(), capQuery) + if err != nil { + errorlog.Record("prometheus", "warning", "pvc capacity-bytes query failed for %s/%s: %v", namespace, name, err) + writeJSON(w, http.StatusOK, resp) + return + } + + used := firstValue(usedRes) + capacity := firstValue(capRes) + if used == nil || capacity == nil || *capacity <= 0 { + writeJSON(w, http.StatusOK, resp) + return + } + + resp.Used = int64(*used) + resp.Capacity = int64(*capacity) + resp.Ratio = *used / *capacity + resp.HasData = true + writeJSON(w, http.StatusOK, resp) +} + +func firstValue(res *QueryResult) *float64 { + if res == nil || len(res.Series) == 0 || len(res.Series[0].DataPoints) == 0 { + return nil + } + v := res.Series[0].DataPoints[0].Value + if v != v { + return nil + } + return &v +} diff --git a/internal/prometheus/rightsizing_test.go b/internal/prometheus/rightsizing_test.go new file mode 100644 index 000000000..ddee299b1 --- /dev/null +++ b/internal/prometheus/rightsizing_test.go @@ -0,0 +1,208 @@ +package prometheus + +import ( + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func mustQuantity(t *testing.T, s string) *resource.Quantity { + t.Helper() + q := resource.MustParse(s) + return &q +} + +func TestClassifyRightsizing(t *testing.T) { + q := func(s string) *resource.Quantity { return mustQuantity(t, s) } + + tests := []struct { + name string + p95 float64 + req, lim *resource.Quantity + resKind string + wantTone Tone + wantMsg string + wantRec bool + }{ + // Memory OOM rule — the only path that fires critical. + {"memory p95 at 95% of limit fires critical", 0.95 * 256 * 1024 * 1024, q("128Mi"), q("256Mi"), "memory", ToneCritical, "OOM risk", true}, + // p95 well below limit AND below request → falls into ratio branch. 
+ {"memory p95 below 95% of limit does not fire critical", 100 * 1024 * 1024, q("256Mi"), q("512Mi"), "memory", ToneOK, "", false}, + {"memory p95 above limit still critical", 300 * 1024 * 1024, q("128Mi"), q("256Mi"), "memory", ToneCritical, "OOM risk", true}, + + // CPU throttle rule — strict greater than limit. + {"cpu p95 above limit fires alert", 1.001, q("100m"), q("1"), "cpu", ToneAlert, "throttling", true}, + // p95 == limit → strict > fails, falls into ratio branch. ratio=1.0 → well-sized. + {"cpu p95 exactly at limit does not fire alert", 1.0, q("1"), q("1"), "cpu", ToneOK, "Well-sized", false}, + + // No request set → warning, not critical. + {"no cpu request", 0.5, nil, q("1"), "cpu", ToneWarning, "No cpu request", true}, + {"no memory request", 200 * 1024 * 1024, nil, q("1Gi"), "memory", ToneWarning, "No memory request", true}, + + // P95 exceeds request but within limit → warning. + {"cpu p95 exceeds request", 0.15, q("100m"), q("1"), "cpu", ToneWarning, "exceeds request", true}, + + // "Well-sized" thresholds — the user-facing nag policy. Boundary at 3×. + {"ratio 1.0 well-sized", 0.1, q("100m"), q("500m"), "cpu", ToneOK, "Well-sized", false}, + {"ratio 3.0 well-sized", 0.1, q("300m"), q("1"), "cpu", ToneOK, "Well-sized", false}, + {"ratio just over 3.0 shows headroom", 0.0999, q("300m"), q("1"), "cpu", ToneOK, "headroom", false}, + + // CPU over-provisioned threshold = 8× (strict greater). + {"cpu ratio 8.0 shows headroom only", 0.1, q("800m"), q("2"), "cpu", ToneOK, "headroom", false}, + {"cpu ratio above 8 surfaces as info", 0.0999, q("800m"), q("2"), "cpu", ToneInfo, "Over-provisioned", true}, + + // Memory over-provisioned threshold = 5× (strict greater). 
+ {"memory ratio 5.0 shows headroom only", 50 * 1024 * 1024, q("250Mi"), q("1Gi"), "memory", ToneOK, "headroom", false}, + {"memory ratio above 5 surfaces as info", 49.9 * 1024 * 1024, q("250Mi"), q("1Gi"), "memory", ToneInfo, "Over-provisioned", true}, + + // Defensive: zero request short-circuits without crashing. + {"zero cpu request", 0.5, q("0"), nil, "cpu", ToneOK, "", false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + row := RightsizingRow{Tone: ToneOK} + classifyRightsizing(&row, tc.p95, tc.req, tc.lim, tc.resKind) + if row.Tone != tc.wantTone { + t.Errorf("tone = %s, want %s (msg=%q)", row.Tone, tc.wantTone, row.Message) + } + if tc.wantMsg != "" && !strings.Contains(row.Message, tc.wantMsg) { + t.Errorf("message = %q, want substring %q", row.Message, tc.wantMsg) + } + if tc.wantRec && row.RecommendedReq == nil { + t.Errorf("expected RecommendedReq populated, got nil") + } + if !tc.wantRec && row.RecommendedReq != nil { + t.Errorf("expected no RecommendedReq, got %q", *row.RecommendedReq) + } + }) + } +} + +func TestRecommendRequest(t *testing.T) { + tests := []struct { + name string + p95 float64 + resKind string + want string + }{ + // CPU — 15% headroom, round to a clean 10m step, floor at 10m. Exact + // step depends on float repr (1.15× a non-representable value can + // round down by one step), but the result is always a clean 10m + // boundary — the "no noisy 137m recommendations" promise. + {"cpu sub-milli rounds to 1m", 0.0001, "cpu", "1m"}, + {"cpu 100m → ~115m → clean 110m step", 0.100, "cpu", "110m"}, + {"cpu 1 core → ~1150m → 1.1 cores (round-half-to-even)", 1.0, "cpu", "1.1"}, + {"cpu integer cores trim trailing zero", 0.870, "cpu", "1"}, + {"cpu floor at 10m", 0.001, "cpu", "10m"}, + + // Memory — 10% headroom, round up to next 16Mi, floor at 16Mi. 
+ {"memory tiny floors at 16Mi", 1024, "memory", "16Mi"}, + {"memory 100Mi → 110Mi → next 16Mi step", 100 * 1024 * 1024, "memory", "112Mi"}, + {"memory 1Gi exact boundary", 1024 * 1024 * 1024, "memory", "1.1Gi"}, + {"memory just under 1Gi shows Mi", 900 * 1024 * 1024, "memory", "992Mi"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := recommendRequest(tc.p95, tc.resKind) + if got != tc.want { + t.Errorf("recommendRequest(%g, %q) = %q, want %q", tc.p95, tc.resKind, got, tc.want) + } + }) + } +} + +func TestExtractRuntimeContainers(t *testing.T) { + always := corev1.ContainerRestartPolicyAlways + onFailure := corev1.ContainerRestartPolicy("OnFailure") + + tests := []struct { + name string + spec *corev1.PodSpec + wantNames []string + }{ + {"regular containers only", &corev1.PodSpec{ + Containers: []corev1.Container{{Name: "app"}, {Name: "proxy"}}, + }, []string{"app", "proxy"}}, + + {"pure init excluded", &corev1.PodSpec{ + Containers: []corev1.Container{{Name: "app"}}, + InitContainers: []corev1.Container{{Name: "migrate"}}, + }, []string{"app"}}, + + // Load-bearing native-sidecar behavior — without this the request/limit + // overlay misses the sidecar's contribution. 
// slicesEqual reports whether a and b hold the same strings in the same
// order. Only length and element values are compared, so a nil slice and an
// empty slice are considered equal.
func slicesEqual(a, b []string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for idx, want := range b {
		if a[idx] != want {
			return false
		}
	}
	return true
}
a/internal/server/server.go +++ b/internal/server/server.go @@ -373,7 +373,13 @@ func (s *Server) setupRoutes() { imageHandlers := images.NewHandlers() imageHandlers.RegisterRoutes(r) - // Prometheus metrics routes + // Prometheus metrics routes. The auth gate is required for endpoints + // that read K8s spec data via the shared informer cache (rightsizing, + // PVC usage) — the cache is populated under Radar's SA, so without + // it any authenticated user could fetch any namespace's spec. + prometheuspkg.SetAuthGate(func(req *http.Request, group, resource, namespace, verb string) bool { + return s.canRead(req, group, resource, namespace, verb) + }) prometheuspkg.RegisterRoutes(r) // OpenCost routes diff --git a/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx index 899ff5705..d10b60c80 100644 --- a/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx +++ b/packages/k8s-ui/src/components/resources/renderers/HPARenderer.tsx @@ -1,3 +1,4 @@ +import type { ReactNode } from 'react' import { Cpu, AlertTriangle } from 'lucide-react' import { clsx } from 'clsx' import { Section, PropertyList, Property, ConditionsSection, ResourceLink } from '../../ui/drawer-components' @@ -7,6 +8,8 @@ import { formatAge } from '../resource-utils' interface HPARendererProps { data: any onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void + /** Optional host-provided section rendered after Conditions — used to inject Prometheus-backed charts. 
*/ + extraSections?: ReactNode } // Extract problems from HPA conditions @@ -53,7 +56,7 @@ function getHPAProblems(data: any): string[] { return problems } -export function HPARenderer({ data, onNavigate }: HPARendererProps) { +export function HPARenderer({ data, onNavigate, extraSections }: HPARendererProps) { const status = data.status || {} const spec = data.spec || {} const metrics = status.currentMetrics || [] @@ -160,6 +163,8 @@ export function HPARenderer({ data, onNavigate }: HPARendererProps) { )} + + {extraSections} ) } diff --git a/packages/k8s-ui/src/components/resources/renderers/PVCRenderer.tsx b/packages/k8s-ui/src/components/resources/renderers/PVCRenderer.tsx index 68bfa6fbe..cc77b7bb1 100644 --- a/packages/k8s-ui/src/components/resources/renderers/PVCRenderer.tsx +++ b/packages/k8s-ui/src/components/resources/renderers/PVCRenderer.tsx @@ -1,3 +1,4 @@ +import type { ReactNode } from 'react' import { HardDrive } from 'lucide-react' import { clsx } from 'clsx' import { Section, PropertyList, Property, ConditionsSection, AlertBanner, ResourceLink } from '../../ui/drawer-components' @@ -5,6 +6,8 @@ import { Section, PropertyList, Property, ConditionsSection, AlertBanner, Resour interface PVCRendererProps { data: any onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void + /** Optional host-provided section, used for a Prometheus-derived usage gauge. 
*/ + extraSections?: ReactNode } const accessModeShorthand: Record = { @@ -19,7 +22,7 @@ function formatAccessModes(modes: string[] | undefined): string | undefined { return modes.map(m => accessModeShorthand[m] || m).join(', ') } -export function PVCRenderer({ data, onNavigate }: PVCRendererProps) { +export function PVCRenderer({ data, onNavigate, extraSections }: PVCRendererProps) { const status = data.status || {} const spec = data.spec || {} const annotations = data.metadata?.annotations || {} @@ -92,6 +95,8 @@ export function PVCRenderer({ data, onNavigate }: PVCRendererProps) { )} + {extraSections} + ) diff --git a/packages/k8s-ui/src/components/shared/ResourceRendererDispatch.tsx b/packages/k8s-ui/src/components/shared/ResourceRendererDispatch.tsx index d35df8adf..ce6da3cc8 100644 --- a/packages/k8s-ui/src/components/shared/ResourceRendererDispatch.tsx +++ b/packages/k8s-ui/src/components/shared/ResourceRendererDispatch.tsx @@ -279,6 +279,18 @@ export interface RendererOverrides { data: any onNavigate?: (ref: ResourceRef) => void }> + // HPA: host wraps the base renderer to add Prometheus-backed replicas / + // metric charts below the static spec data. + HPARenderer?: React.ComponentType<{ + data: any + onNavigate?: (ref: ResourceRef) => void + }> + // PVC: host wraps the base renderer to add a kubelet-derived usage gauge + // when Prometheus is scraping kubelet endpoints. + PVCRenderer?: React.ComponentType<{ + data: any + onNavigate?: (ref: ResourceRef) => void + }> } // Known resource types with specific renderers (module-level to avoid re-allocation) @@ -450,6 +462,8 @@ export function ResourceRendererDispatch({ const RoleComp = rendererOverrides?.RoleRenderer ?? RoleRenderer const RoleBindingComp = rendererOverrides?.RoleBindingRenderer ?? RoleBindingRenderer const NamespaceComp = rendererOverrides?.NamespaceRenderer ?? NamespaceRenderer + const HPAComp = rendererOverrides?.HPARenderer ?? HPARenderer + const PVCComp = rendererOverrides?.PVCRenderer ?? 
PVCRenderer const sidebarContent = showCommonSections && ( <> @@ -474,9 +488,9 @@ export function ResourceRendererDispatch({ {kind === 'secrets' && } {kind === 'jobs' && } {kind === 'cronjobs' && } - {(kind === 'hpas' || kind === 'horizontalpodautoscalers') && } + {(kind === 'hpas' || kind === 'horizontalpodautoscalers') && } {kind === 'nodes' && } - {kind === 'persistentvolumeclaims' && } + {kind === 'persistentvolumeclaims' && } {kind === 'rollouts' && } {kind === 'certificates' && !data?.apiVersion?.includes('networking.internal.knative.dev') && } {kind === 'workflows' && } diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 4d074a98c..df1b17d49 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -1246,9 +1246,44 @@ export interface PrometheusResourceMetrics { hint?: string // Contextual hint when results are empty (e.g. cri-docker label issues) } -export type PrometheusMetricCategory = 'cpu' | 'memory' | 'network_rx' | 'network_tx' | 'filesystem' +export type PrometheusMetricCategory = 'cpu' | 'memory' | 'network_rx' | 'network_tx' | 'filesystem' | 'restarts' export type PrometheusTimeRange = '10m' | '30m' | '1h' | '3h' | '6h' | '12h' | '24h' | '48h' | '7d' | '14d' +// PVC usage at a moment in time, derived from kubelet_volume_stats_*. +// HasData=false silently indicates the CSI driver doesn't report or Prom +// isn't scraping kubelet endpoints — UI should hide the gauge in that case. 
+export interface PrometheusPVCUsage { + namespace: string + name: string + used: number + capacity: number + ratio: number + hasData: boolean +} + +export type RightsizingTone = 'ok' | 'info' | 'warning' | 'alert' | 'critical' + +export interface RightsizingRow { + container: string + resource: 'cpu' | 'memory' + currentRequest?: string + currentLimit?: string + p95?: string + recommendedRequest?: string + tone: RightsizingTone + message: string +} + +export interface PrometheusRightsizing { + kind: string + namespace: string + name: string + window: string + sampleAvailable: boolean + rows: RightsizingRow[] + reason?: string +} + // Check Prometheus availability export function usePrometheusStatus() { return useQuery({ @@ -1337,6 +1372,40 @@ export function usePrometheusClusterMetrics( }) } +// Fetch PVC usage. hasData=false when no series — UI should hide the gauge. +export function usePrometheusPVCUsage(namespace: string, name: string, enabled = true) { + return useQuery({ + queryKey: ['prometheus-pvc-usage', namespace, name], + queryFn: () => fetchJSON(`/prometheus/pvc/${namespace}/${name}`), + enabled: enabled && Boolean(namespace && name), + staleTime: 60000, + refetchInterval: 120000, + }) +} + +// Fetch rightsizing recommendations for a workload (Deployment / StatefulSet / DaemonSet). +export function usePrometheusRightsizing(kind: string, namespace: string, name: string, enabled = true) { + return useQuery({ + queryKey: ['prometheus-rightsizing', kind, namespace, name], + queryFn: () => fetchJSON(`/prometheus/rightsizing/${kind}/${namespace}/${name}`), + enabled: enabled && Boolean(kind && namespace && name), + staleTime: 5 * 60 * 1000, // P95 over 24h is slow to shift; cache aggressively + refetchInterval: 10 * 60 * 1000, + }) +} + +// Raw PromQL query (range). Used by HPA charts for status_current_replicas etc. 
+export function usePromQLRange(query: string, range: PrometheusTimeRange = '1h', enabled = true) { + return useQuery({ + queryKey: ['promql-range', query, range], + queryFn: () => + fetchJSON(`/prometheus/query?query=${encodeURIComponent(query)}&range=${range}`), + enabled: enabled && Boolean(query), + staleTime: 30000, + refetchInterval: 60000, + }) +} + // ============================================================================ // Pod Logs // ============================================================================ diff --git a/web/src/components/resource/HPACharts.tsx b/web/src/components/resource/HPACharts.tsx new file mode 100644 index 000000000..151bb5c0e --- /dev/null +++ b/web/src/components/resource/HPACharts.tsx @@ -0,0 +1,228 @@ +import { useMemo } from 'react' +import { LineChart } from 'lucide-react' +import { usePromQLRange, usePrometheusStatus, type PrometheusSeries } from '../../api/client' + +/** + * HPACharts — replicas-over-time chart for an HPA. + * + * Sources from KSM `kube_horizontalpodautoscaler_status_{current,desired}_replicas`. + * Hidden silently when Prom isn't connected or KSM isn't reporting the series. + * + * v0 deliberately ships only the replicas chart. The "observed CPU vs target" + * chart that mixin dashboards show requires deriving utilization from cAdvisor + * (since KSM exposes the spec target but not the observed metric), which adds + * complexity that doesn't change the primary question — "did the HPA actually + * scale during this spike?". + */ +export function HPACharts({ data }: { data: any }) { + const { data: status } = usePrometheusStatus() + const isConnected = status?.connected === true + + const namespace = data?.metadata?.namespace ?? '' + const name = data?.metadata?.name ?? '' + const spec = data?.spec ?? {} + const min = spec.minReplicas ?? 
1 + const max = spec.maxReplicas + + const currentQuery = useMemo( + () => `kube_horizontalpodautoscaler_status_current_replicas{namespace="${escapeLabel(namespace)}",horizontalpodautoscaler="${escapeLabel(name)}"}`, + [namespace, name], + ) + const desiredQuery = useMemo( + () => `kube_horizontalpodautoscaler_status_desired_replicas{namespace="${escapeLabel(namespace)}",horizontalpodautoscaler="${escapeLabel(name)}"}`, + [namespace, name], + ) + + const enabled = isConnected && Boolean(namespace && name) + const { data: currentRes, error: currentErr } = usePromQLRange(currentQuery, '1h', enabled) + const { data: desiredRes, error: desiredErr } = usePromQLRange(desiredQuery, '1h', enabled) + + const replicasPoints = useMemo(() => combineSeries({ + current: currentRes?.series, + desired: desiredRes?.series, + }), [currentRes, desiredRes]) + + // Surface Prom-side failures in the console so an operator debugging a + // missing HPA chart has a breadcrumb; the chart still hides silently when + // KSM isn't reporting (the common no-data case). + if (currentErr || desiredErr) { + console.warn('[HPACharts] PromQL query failed', { currentErr, desiredErr }) + } + + if (!isConnected) return null + if (!replicasPoints) return null + + return ( +
+
+ + Activity (last 1h) +
+ + v.toFixed(0)} + /> +
+ ) +} + +// ============================================================================ +// Internals +// ============================================================================ + +interface FlatPoint { timestamp: number; value: number } + +function extractFirstSeries(series: PrometheusSeries[]): FlatPoint[] | null { + for (const s of series) { + if (s.dataPoints.length > 0) { + return s.dataPoints.map(dp => ({ timestamp: dp.timestamp, value: dp.value })) + } + } + return null +} + +function combineSeries(args: { current?: PrometheusSeries[]; desired?: PrometheusSeries[] }): { + current: FlatPoint[] + desired: FlatPoint[] +} | null { + const current = args.current ? extractFirstSeries(args.current) : null + const desired = args.desired ? extractFirstSeries(args.desired) : null + if (!current && !desired) return null + return { + current: current ?? [], + desired: desired ?? [], + } +} + +function escapeLabel(s: string): string { + return s.replace(/[\\"]/g, '\\$&') +} + +// ============================================================================ +// DualLineChart — minimal two-line chart for HPA-style time series. +// Deliberately separate from PrometheusCharts.AreaChart: the chart shapes are +// different (line not area, discrete integer Y axis for replicas), and +// reusing the area chart would require adding more knobs to it. +// ============================================================================ + +interface LineSpec { + label: string + points: FlatPoint[] + color: string + dashed?: boolean +} + +interface RefLine { value: number; label: string; color: string } + +function DualLineChart({ title, height, primary, secondary, referenceLines, formatY }: { + title: string + height: number + primary: LineSpec + secondary?: LineSpec + referenceLines?: RefLine[] + formatY: (v: number) => string +}) { + const allPoints = [...primary.points, ...(secondary?.points ?? [])] + if (allPoints.length === 0) { + return ( +
{title} — no data
+ ) + } + + const minTs = Math.min(...allPoints.map(p => p.timestamp)) + const maxTs = Math.max(...allPoints.map(p => p.timestamp)) + const tsSpan = Math.max(maxTs - minTs, 60) + + let maxV = Math.max(...allPoints.map(p => p.value), 1) + if (referenceLines) { + for (const rl of referenceLines) maxV = Math.max(maxV, rl.value) + } + // Add 10% headroom so the top line isn't flush with the top edge. + maxV = maxV * 1.1 + + const width = 600 + const marginL = 36 + const marginR = 16 + const marginT = 4 + const marginB = 18 + const plotW = width - marginL - marginR + const plotH = height - marginT - marginB + + const toX = (ts: number) => marginL + ((ts - minTs) / tsSpan) * plotW + const toY = (v: number) => marginT + plotH - (v / maxV) * plotH + + const drawLine = (spec: LineSpec) => { + if (spec.points.length === 0) return null + const d = spec.points.map((p, i) => `${i === 0 ? 'M' : 'L'}${toX(p.timestamp).toFixed(1)},${toY(p.value).toFixed(1)}`).join(' ') + return ( + + ) + } + + return ( +
+
+ {title} +
+ + {secondary && } +
+
+ + {/* Y ticks */} + {[0, 0.5, 1].map(frac => { + const v = maxV * frac + const y = toY(v) + return ( + + + + {formatY(v)} + + + ) + })} + {/* Reference lines */} + {referenceLines?.map((rl, i) => { + const y = toY(rl.value) + return ( + + + + {rl.label} + + + ) + })} + {drawLine(primary)} + {secondary && drawLine(secondary)} + +
+ ) +} + +function Legend({ color, label, dashed }: { color: string; label: string; dashed?: boolean }) { + return ( + + + + + {label} + + ) +} diff --git a/web/src/components/resource/PVCUsageBar.tsx b/web/src/components/resource/PVCUsageBar.tsx new file mode 100644 index 000000000..86ec7dcaf --- /dev/null +++ b/web/src/components/resource/PVCUsageBar.tsx @@ -0,0 +1,62 @@ +import { usePrometheusPVCUsage, usePrometheusStatus } from '../../api/client' + +/** + * PVCUsageBar — single-line capacity gauge derived from kubelet_volume_stats_*. + * + * Hidden silently when: + * - Prometheus isn't connected + * - The CSI driver doesn't implement NodeGetVolumeStats + * - Prometheus isn't scraping kubelet endpoints (notably GMP default config) + * + * Operators get nothing rather than a "no data" message that'd look like Radar + * is broken — the absence is information enough. + */ +export function PVCUsageBar({ namespace, name }: { namespace: string; name: string }) { + const { data: status } = usePrometheusStatus() + const isConnected = status?.connected === true + const { data: usage } = usePrometheusPVCUsage(namespace, name, isConnected) + + if (!usage || !usage.hasData) return null + + const pct = Math.max(0, Math.min(1, usage.ratio)) + const usedLabel = formatBytes(usage.used) + const capLabel = formatBytes(usage.capacity) + const pctLabel = `${(pct * 100).toFixed(0)}%` + + // Tone: green well under, amber > 75%, red > 90%. PVCs fill silently — the + // top tone is justified because the consequence (write failures) is severe. + const tone = pct >= 0.9 ? 'critical' : pct >= 0.75 ? 'warning' : 'ok' + const barColor = + tone === 'critical' ? 'bg-red-500' : + tone === 'warning' ? 'bg-amber-500' : + 'bg-emerald-500' + // Light/dark-paired text tones — `text-red-400` alone washes out in light + // mode (Tailwind's 400 stop is calibrated for dark backgrounds). + const textColor = + tone === 'critical' ? 'text-red-700 dark:text-red-400' : + tone === 'warning' ? 
'text-amber-700 dark:text-amber-400' : + 'text-theme-text-secondary' + + return ( +
+
+ Usage + + {usedLabel} / {capLabel} + ({pctLabel}) + +
+
+
+
+
+ ) +} + +function formatBytes(b: number): string { + if (b < 1024) return `${b} B` + if (b < 1024 ** 2) return `${(b / 1024).toFixed(1)} KiB` + if (b < 1024 ** 3) return `${(b / 1024 ** 2).toFixed(1)} MiB` + if (b < 1024 ** 4) return `${(b / 1024 ** 3).toFixed(2)} GiB` + return `${(b / 1024 ** 4).toFixed(2)} TiB` +} diff --git a/web/src/components/resource/PrometheusCharts.tsx b/web/src/components/resource/PrometheusCharts.tsx index 24328743b..09a8084a3 100644 --- a/web/src/components/resource/PrometheusCharts.tsx +++ b/web/src/components/resource/PrometheusCharts.tsx @@ -76,9 +76,15 @@ interface PrometheusChartsProps { name: string /** When true, show "no data" empty state instead of hiding. Defaults to false (hide when no data). */ showEmptyState?: boolean + /** + * Full K8s resource. When provided, CPU and memory charts overlay the + * per-pod request/limit (summed across runtime containers including + * native sidecars; no replica multiplier is applied). + */ + resource?: any } -export function PrometheusCharts({ kind, namespace, name, showEmptyState = false }: PrometheusChartsProps) { +export function PrometheusCharts({ kind, namespace, name, showEmptyState = false, resource }: PrometheusChartsProps) { const { data: status, isLoading: statusLoading } = usePrometheusStatus() const connectMutation = usePrometheusConnect() @@ -144,6 +150,13 @@ export function PrometheusCharts({ kind, namespace, name, showEmptyState = false const activeCategoryDef = categories.find(c => c.key === activeCategory) || categories[0] + // Reference lines: aggregate request/limit overlaid on CPU and memory charts. + // Memory unit is bytes; CPU unit is cores; both match the chart axis. + const referenceLines = useMemo(() => { + if (!resource || (activeCategory !== 'cpu' && activeCategory !== 'memory')) return undefined + return computeRequestLimitLines(resource, kind, activeCategory) + }, [resource, kind, activeCategory]) + return ( 
{/* Toolbar */} @@ -206,6 +219,7 @@ export function PrometheusCharts({ kind, namespace, name, showEmptyState = false color={activeCategoryDef.chartColor} fillColor={activeCategoryDef.fillColor} unit={metrics.unit} + referenceLines={referenceLines} />
@@ -329,11 +343,19 @@ function computeShortLabels(labels: string[]): string[] { return suffixes } -function AreaChart({ series, color, fillColor, unit }: { +export interface ReferenceLine { + value: number + label: string + /** 'request' is muted gray, 'limit' is amber — neither alarming */ + tone: 'request' | 'limit' +} + +function AreaChart({ series, color, fillColor, unit, referenceLines }: { series: PrometheusSeries[] color: string fillColor: string unit: string + referenceLines?: ReferenceLine[] }) { const svgRef = useRef(null) const [hoverX, setHoverX] = useState(null) @@ -361,11 +383,19 @@ function AreaChart({ series, color, fillColor, unit }: { maxVal = unit === 'cores' ? 0.01 : unit === 'bytes' ? 1024 * 1024 : unit === 'bytes/s' ? 1024 : 1 } + // Extend axis to include reference lines so request/limit aren't clipped at + // the top, which would make the relationship between usage and limit unreadable. + if (referenceLines) { + for (const rl of referenceLines) { + if (rl.value > maxVal) maxVal = rl.value + } + } + const padding = maxVal * 0.1 const yMax = maxVal + padding return { minTs, maxTs, yMax, series } - }, [series, unit]) + }, [series, unit, referenceLines]) if (!chartData) return null @@ -544,6 +574,40 @@ function AreaChart({ series, color, fillColor, unit }: { /> ))} + {/* Reference lines (request / limit overlays) */} + {referenceLines?.map((rl, i) => { + // Clamp y so the line never escapes the plot — the yMax expansion + // above keeps it in bounds normally, but guard against rounding. + const y = Math.max(marginTop, Math.min(marginTop + plotHeight, toY(rl.value))) + const stroke = rl.tone === 'limit' ? '#f59e0b' : '#94a3b8' + // Tone-matched labels on the right edge. 
+ return ( + + + + {rl.label} + + + ) + })} + {/* Hover crosshair + dots */} {hoverData && ( <> @@ -678,6 +742,124 @@ function formatTimestamp(unix: number): string { return d.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' }) } +// ============================================================================ +// Request/limit overlay derivation +// ============================================================================ + +/** + * Compute aggregate request + limit reference lines from a K8s resource spec. + * Sums across runtime containers (regular + native sidecars), excluding pure + * init containers. The values are per-pod — workload charts use + * `sum(...) by (pod, namespace)` (one series per pod, at per-pod scale), so + * the reference line lives on the same axis without any replica multiplier. + * + * Returns undefined when the spec doesn't have enough information to render + * a meaningful line (no runtime containers, or no values set on any container). + */ +function computeRequestLimitLines( + resource: any, + kind: string, + category: 'cpu' | 'memory', +): ReferenceLine[] | undefined { + if (!resource) return undefined + const podSpec = extractPodSpec(resource, kind) + if (!podSpec) return undefined + + const runtimeContainers = collectRuntimeContainers(podSpec) + if (runtimeContainers.length === 0) return undefined + + let reqSum = 0, reqAny = false + let limSum = 0, limAny = false + for (const c of runtimeContainers) { + const req = readQuantity(c.resources?.requests?.[category], category) + const lim = readQuantity(c.resources?.limits?.[category], category) + if (req != null) { reqSum += req; reqAny = true } + if (lim != null) { limSum += lim; limAny = true } + } + + const lines: ReferenceLine[] = [] + if (reqAny) { + lines.push({ + value: reqSum, + label: `request ${formatRequestLimitLabel(reqSum, category)}`, + tone: 'request', + }) + } + if (limAny) { + lines.push({ + value: limSum, + label: `limit ${formatRequestLimitLabel(limSum, 
category)}`, + tone: 'limit', + }) + } + return lines.length > 0 ? lines : undefined +} + +function extractPodSpec(resource: any, kind: string): any | undefined { + if (kind === 'Pod') return resource?.spec + if (kind === 'CronJob') return resource?.spec?.jobTemplate?.spec?.template?.spec + return resource?.spec?.template?.spec +} + +function collectRuntimeContainers(podSpec: any): any[] { + const out: any[] = [] + for (const c of (podSpec?.containers || [])) out.push(c) + // Native sidecars (initContainers with restartPolicy: Always, GA in 1.33) + // run for the pod's lifetime and contribute to steady-state usage. Pure + // init containers run to completion and don't. + for (const c of (podSpec?.initContainers || [])) { + if (c?.restartPolicy === 'Always') out.push(c) + } + return out +} + +const CPU_SUFFIXES: Record<string, number> = { n: 1e-9, u: 1e-6, m: 1e-3 } +const MEMORY_SUFFIXES: Record<string, number> = { + Ki: 1024, Mi: 1024 ** 2, Gi: 1024 ** 3, Ti: 1024 ** 4, Pi: 1024 ** 5, Ei: 1024 ** 6, + m: 1e-3, k: 1e3, K: 1e3, M: 1e6, G: 1e9, T: 1e12, P: 1e15, E: 1e18, +} + +function readQuantity(raw: unknown, category: 'cpu' | 'memory'): number | null { + if (raw == null) return null + const s = String(raw).trim() + if (s === '') return null + // Strip suffix and parse. Each branch must guard against NaN — otherwise + // garbage like "abcMi" returns NaN and poisons the caller's running sum, + // which silently produces a missing/zeroed reference line on the chart. + if (category === 'cpu') { + if (s.endsWith('m')) return scaleOrNull(s, CPU_SUFFIXES.m) + if (s.endsWith('n')) return scaleOrNull(s, CPU_SUFFIXES.n) + if (s.endsWith('u')) return scaleOrNull(s, CPU_SUFFIXES.u) + const v = parseFloat(s) + return isNaN(v) ? null : v + } + // Memory: try two-character then one-character suffixes (Mi before M). Kilo is lowercase k in Kubernetes (K kept for leniency); m is milli-bytes. 
+ for (const suffix of ['Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei']) { + if (s.endsWith(suffix)) return scaleOrNull(s, MEMORY_SUFFIXES[suffix]) + } + for (const suffix of ['m', 'k', 'K', 'M', 'G', 'T', 'P', 'E']) { + if (s.endsWith(suffix)) return scaleOrNull(s, MEMORY_SUFFIXES[suffix]) + } + const v = parseFloat(s) + return isNaN(v) ? null : v +} + +function scaleOrNull(s: string, scale: number): number | null { + const v = parseFloat(s) + return isNaN(v) ? null : v * scale +} + +function formatRequestLimitLabel(value: number, category: 'cpu' | 'memory'): string { + if (category === 'cpu') { + if (value < 1) return `${Math.round(value * 1000)}m` + return value.toFixed(2).replace(/\.?0+$/, '') + } + // Memory — match formatMetricValue's tier breakpoints. + if (value < 1024 * 1024) return `${(value / 1024).toFixed(0)}KiB` + if (value < 1024 ** 3) return `${(value / (1024 ** 2)).toFixed(0)}MiB` + return `${(value / (1024 ** 3)).toFixed(1)}GiB` +} + // ============================================================================ // Export helper to check if a kind is supported // ============================================================================ diff --git a/web/src/components/resource/RestartChart.tsx b/web/src/components/resource/RestartChart.tsx new file mode 100644 index 000000000..1d7dcd0e4 --- /dev/null +++ b/web/src/components/resource/RestartChart.tsx @@ -0,0 +1,110 @@ +import { useMemo } from 'react' +import { AlertCircle } from 'lucide-react' +import { usePrometheusResourceMetrics, usePrometheusStatus, type PrometheusSeries } from '../../api/client' + +/** + * RestartEventLane — vertical markers at each restart event, on a dedicated + * row below the chart. Markers stay readable when they cluster because they + * don't overlay the chart waveform. KSM-gated (uses kube_pod_container_status_restarts_total) + * — silently hidden when Prom isn't connected or the series doesn't exist. 
+ */ +export function RestartEventLane({ kind, namespace, name, range = '1h' }: { + kind: string + namespace: string + name: string + range?: '1h' | '6h' | '24h' | '7d' +}) { + const { data: status } = usePrometheusStatus() + const isConnected = status?.connected === true + const { data: metrics, isLoading, error } = usePrometheusResourceMetrics(kind, namespace, name, 'restarts', range, isConnected) + + const restarts = useMemo(() => collectRestartEvents(metrics?.result?.series), [metrics]) + + // A real Prom-side failure shouldn't look identical to "no restarts" — log + // it so an operator investigating a missing lane has a breadcrumb. The lane + // still hides because we don't want a permanent red banner on every pod. + if (error) { + console.warn('[RestartEventLane] restart query failed', error) + } + + if (!isConnected || isLoading) return null + if (restarts.length === 0) return null + + const minTs = Math.min(...restarts.map(r => r.timestamp)) + const maxTs = Math.max(...restarts.map(r => r.timestamp)) + // Avoid divide-by-zero when there's a single event. + const span = Math.max(maxTs - minTs, 60) + + return ( +
+
+ + + Restarts in last {range} + + + {restarts.reduce((n, r) => n + r.value, 0)} total + +
+
+ {/* Baseline */} +
+ {/* Markers */} + {restarts.map((r, i) => { + const left = `${((r.timestamp - minTs) / span) * 100}%` + return ( +
1 ? ` ×${r.value}` : ''}`} + > +
+
+ ) + })} +
+
+ ) +} + +// ============================================================================ +// Internals +// ============================================================================ + +interface RestartEvent { + timestamp: number + value: number + label: string +} + +function collectRestartEvents(series: PrometheusSeries[] | undefined): RestartEvent[] { + if (!series) return [] + const events: RestartEvent[] = [] + for (const s of series) { + const pod = s.labels.pod ?? 'pod' + // `changes(...[1h])` produces a rolling count, so a single restart shows + // value=1 for ~60 consecutive samples (the whole 1h window). Emit a marker + // only when the count increases — that's when a *new* restart entered the + // window — and use the increase as the marker's restart count. + let prev: number | null = null + for (const dp of s.dataPoints) { + if (prev === null) { + // First sample. If nonzero, the restart happened just before our + // window started but is still recent enough to flag; record one + // marker rather than fabricate a count. + if (dp.value > 0) { + events.push({ timestamp: dp.timestamp, value: 1, label: pod }) + } + } else { + const delta = dp.value - prev + if (delta > 0) { + events.push({ timestamp: dp.timestamp, value: delta, label: pod }) + } + } + prev = dp.value + } + } + events.sort((a, b) => a.timestamp - b.timestamp) + return events +} diff --git a/web/src/components/resource/RightsizingStrip.tsx b/web/src/components/resource/RightsizingStrip.tsx new file mode 100644 index 000000000..ae3ebae9a --- /dev/null +++ b/web/src/components/resource/RightsizingStrip.tsx @@ -0,0 +1,138 @@ +import { ArrowRight, Check, Info, AlertTriangle } from 'lucide-react' +import { usePrometheusRightsizing, usePrometheusStatus, type RightsizingTone, type RightsizingRow } from '../../api/client' + +const RIGHTSIZING_KINDS = new Set(['Deployment', 'StatefulSet', 'DaemonSet']) + +/** + * RightsizingStrip — compact "current → recommended" table per container. 
+ * + * Tone policy is deliberately mild: + * - "Well-sized" or "Nx headroom" → neutral, no badge + * - 5×+ over-provisioning → info ("could reduce"), not a problem + * - P95 exceeds request but within limit → warning + * - P95 exceeds CPU limit → alert (throttling) + * - Memory P95 near limit → critical (active OOM risk) + * + * Anything below severe over-provisioning displays without flagging it as + * an issue. 2-3× headroom is the common, sensible default and should not nag. + */ +export function RightsizingStrip({ kind, namespace, name }: { + kind: string + namespace: string + name: string +}) { + const { data: status } = usePrometheusStatus() + const isConnected = status?.connected === true + const supported = RIGHTSIZING_KINDS.has(kind) + const { data, isLoading } = usePrometheusRightsizing(kind, namespace, name, isConnected && supported) + + if (!supported || !isConnected || isLoading) return null + if (!data) return null + if (!data.sampleAvailable || data.rows.length === 0) return null + + // Group rows by container so each container is a compact two-row block (cpu+mem). + const byContainer = new Map() + for (const row of data.rows) { + if (!byContainer.has(row.container)) byContainer.set(row.container, []) + byContainer.get(row.container)!.push(row) + } + + return ( +
+
+

Right-sizing

+ + based on last {data.window} · P95 + +
+
+ {Array.from(byContainer.entries()).map(([container, rows]) => ( +
+
{container}
+
+ {rows.map(row => ( + + ))} +
+
+ ))} +
+
+ ) +} + +function RightsizingLine({ row }: { row: RightsizingRow }) { + const showRec = row.recommendedRequest && row.recommendedRequest !== row.currentRequest + const toneClass = toneClasses(row.tone) + const Icon = toneIcon(row.tone) + + return ( +
+ + {row.resource} + + + + {row.currentRequest ?? unset} + + + {showRec && ( + <> + + + {row.recommendedRequest} + + + )} + + {row.p95 && ( + + (P95 {row.p95}) + + )} + + {row.tone !== 'ok' && Icon && ( + + + {row.message} + + )} + + {row.tone === 'ok' && row.message && ( + {row.message} + )} +
+ ) +} + +function toneClasses(tone: RightsizingTone): { value: string; badge: string } { + switch (tone) { + case 'critical': + return { value: 'text-red-400', badge: 'text-red-400 bg-red-500/10' } + case 'alert': + return { value: 'text-orange-400', badge: 'text-orange-400 bg-orange-500/10' } + case 'warning': + return { value: 'text-amber-400', badge: 'text-amber-400 bg-amber-500/10' } + case 'info': + // Muted on purpose — "could reduce" is a suggestion, not a problem. + return { value: 'text-blue-300', badge: 'text-theme-text-tertiary bg-theme-elevated/60' } + case 'ok': + default: + return { value: 'text-theme-text-secondary', badge: '' } + } +} + +function toneIcon(tone: RightsizingTone) { + switch (tone) { + case 'critical': + case 'alert': + case 'warning': + return AlertTriangle + case 'info': + return Info + case 'ok': + return Check + default: + return null + } +} diff --git a/web/src/components/resources/renderers/HPARenderer.tsx b/web/src/components/resources/renderers/HPARenderer.tsx index ac6e58db4..8952efbdd 100644 --- a/web/src/components/resources/renderers/HPARenderer.tsx +++ b/web/src/components/resources/renderers/HPARenderer.tsx @@ -1 +1,17 @@ -export * from '@skyhook-io/k8s-ui/components/resources/renderers/HPARenderer' +import { HPARenderer as BaseHPARenderer } from '@skyhook-io/k8s-ui/components/resources/renderers/HPARenderer' +import { HPACharts } from '../../resource/HPACharts' + +interface HPARendererProps { + data: any + onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void +} + +export function HPARenderer({ data, onNavigate }: HPARendererProps) { + return ( + } + /> + ) +} diff --git a/web/src/components/resources/renderers/PVCRenderer.tsx b/web/src/components/resources/renderers/PVCRenderer.tsx index 08a11756e..5fd69c24c 100644 --- a/web/src/components/resources/renderers/PVCRenderer.tsx +++ b/web/src/components/resources/renderers/PVCRenderer.tsx @@ -1 +1,19 @@ -export * from 
'@skyhook-io/k8s-ui/components/resources/renderers/PVCRenderer' +import { PVCRenderer as BasePVCRenderer } from '@skyhook-io/k8s-ui/components/resources/renderers/PVCRenderer' +import { PVCUsageBar } from '../../resource/PVCUsageBar' + +interface PVCRendererProps { + data: any + onNavigate?: (ref: { kind: string; namespace: string; name: string }) => void +} + +export function PVCRenderer({ data, onNavigate }: PVCRendererProps) { + const namespace = data?.metadata?.namespace ?? '' + const name = data?.metadata?.name ?? '' + return ( + : undefined} + /> + ) +} diff --git a/web/src/components/workload/WorkloadView.tsx b/web/src/components/workload/WorkloadView.tsx index d1dbd25a2..9e458e554 100644 --- a/web/src/components/workload/WorkloadView.tsx +++ b/web/src/components/workload/WorkloadView.tsx @@ -23,6 +23,8 @@ import { fetchJSON, } from '../../api/client' import { PrometheusCharts, isPrometheusSupported } from '../resource/PrometheusCharts' +import { RestartEventLane } from '../resource/RestartChart' +import { RightsizingStrip } from '../resource/RightsizingStrip' import { useResourceAudit, useResources } from '../../api/client' import { AuditAlerts } from '@skyhook-io/k8s-ui' import { WorkloadLogsViewer } from '../logs/WorkloadLogsViewer' @@ -40,6 +42,8 @@ import { ServiceAccountRenderer } from '../resources/renderers/ServiceAccountRen import { RoleRenderer } from '../resources/renderers/RoleRenderer' import { RoleBindingRenderer } from '../resources/renderers/RoleBindingRenderer' import { NamespaceRenderer } from '../resources/renderers/NamespaceRenderer' +import { HPARenderer } from '../resources/renderers/HPARenderer' +import { PVCRenderer } from '../resources/renderers/PVCRenderer' import { CreateResourceDialog } from '../shared/CreateResourceDialog' import { cleanYamlForDuplicate } from '../../utils/skeleton-yaml' import { useDesktopDownload } from '../../hooks/useDesktopDownload' @@ -53,6 +57,8 @@ const rendererOverrides: RendererOverrides = { 
RoleRenderer, RoleBindingRenderer, NamespaceRenderer, + HPARenderer, + PVCRenderer, } // ============================================================================ @@ -393,7 +399,7 @@ export function WorkloadView({ // Render props renderLogsTab={(props) => } renderMetricsTab={({ kind, namespace: ns, name: n }) => ( - + )} isMetricsAvailable={(kind, res) => isPrometheusSupported(kind) && !(kind === 'Pod' && res?.status?.phase === 'Pending') @@ -660,6 +666,37 @@ function FluxSourceConsumersInner({ sourceKind, namespace, name }: { sourceKind: ) } +// Rightsizing only fits the full-screen layout — drawer mode would cramp the +// chart underneath it. The other two children gate themselves on data. +function MetricsTabContent({ kind, namespace, name, resource, expanded }: { + kind: string + namespace: string + name: string + resource: any + expanded: boolean +}) { + const showRightsizing = expanded && ['Deployment', 'StatefulSet', 'DaemonSet'].includes(kind) + const showRestartLane = kind !== 'Node' + + return ( +
+ {showRightsizing && ( +
+ +
+ )} +
+ +
+ {showRestartLane && ( +
+ +
+ )} +
+ ) +} + // FLUX_SOURCE_KIND_BY_LOWER maps lowercase kind (what the inner WorkloadView // produces via its plural-to-singular fallback) to the wire-correct // PascalCase form that consumers carry in spec.sourceRef.kind. HelmChart is