diff --git a/internal/opencost/handlers.go b/internal/opencost/handlers.go index 38a1d98e9..c6ec8c8f2 100644 --- a/internal/opencost/handlers.go +++ b/internal/opencost/handlers.go @@ -3,16 +3,14 @@ package opencost import ( "encoding/json" "log" - "math" "net/http" - "sort" "strings" - "time" "github.com/go-chi/chi/v5" "github.com/skyhook-io/radar/internal/k8s" prometheuspkg "github.com/skyhook-io/radar/internal/prometheus" + pkgopencost "github.com/skyhook-io/radar/pkg/opencost" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" ) @@ -29,217 +27,16 @@ func RegisterRoutes(r chi.Router) { func handleSummary(w http.ResponseWriter, r *http.Request) { client := prometheuspkg.GetClient() if client == nil { - writeJSON(w, http.StatusOK, CostSummary{Available: false, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.CostSummary{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - // Check if Prometheus is reachable (triggers discovery if needed) - _, _, err := client.EnsureConnected(r.Context()) - if err != nil { + if _, _, err := client.EnsureConnected(r.Context()); err != nil { log.Printf("[opencost] EnsureConnected failed (summary): %v", err) - writeJSON(w, http.StatusOK, CostSummary{Available: false, Reason: ReasonNoPrometheus}) - return - } - - // Query per-namespace CPU cost - // container_cpu_allocation is a gauge (current allocated cores), not a counter — use avg_over_time. - // label_replace handles honor_labels=false setups where Prometheus renames the original - // namespace label to exported_namespace and sets namespace to the scrape target's namespace. 
- cpuResult, err := client.Query(r.Context(), - `sum by (namespace) (label_replace(avg_over_time(container_cpu_allocation{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") * on(node) group_left() node_cpu_hourly_cost)`) - if err != nil { - // Try the opencost_container metric name variant (this IS a counter, so rate is correct) - cpuResult, err = client.Query(r.Context(), - `sum by (namespace) (label_replace(rate(opencost_container_cpu_cost_total[1h]), "namespace", "$1", "exported_namespace", "(.+)"))`) - if err != nil { - log.Printf("[opencost] CPU cost query failed: %v", err) - writeJSON(w, http.StatusOK, CostSummary{Available: false, Reason: ReasonQueryError}) - return - } - } - - // Query per-namespace memory cost - // container_memory_allocation_bytes is a gauge — use avg_over_time - memResult, err := client.Query(r.Context(), - `sum by (namespace) (label_replace(avg_over_time(container_memory_allocation_bytes{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) - if err != nil { - // Try the opencost_container metric name variant (this IS a counter, so rate is correct) - memResult, err = client.Query(r.Context(), - `sum by (namespace) (label_replace(rate(opencost_container_memory_cost_total[1h]), "namespace", "$1", "exported_namespace", "(.+)"))`) - if err != nil { - log.Printf("[opencost] Memory cost query failed: %v", err) - writeJSON(w, http.StatusOK, CostSummary{Available: false, Reason: ReasonQueryError}) - return - } - } - - // If both queries returned empty results, OpenCost metrics aren't available - if len(cpuResult.Series) == 0 && len(memResult.Series) == 0 { - writeJSON(w, http.StatusOK, CostSummary{Available: false, Reason: ReasonNoMetrics}) + writeJSON(w, http.StatusOK, pkgopencost.CostSummary{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - // Query actual CPU usage cost (for efficiency calculation) - // cAdvisor metrics use 
"instance" for the node hostname, while OpenCost uses "node", - // so we label_replace to bridge the join. - cpuUsageMap := make(map[string]float64) - cpuUsageResult, err := client.Query(r.Context(), - `sum by (namespace) (label_replace(rate(container_cpu_usage_seconds_total{container!="", namespace!=""}[1h]), "node", "$1", "instance", "(.+?)(?::\\d+)?$") * on(node) group_left() node_cpu_hourly_cost)`) - if err == nil { - for _, s := range cpuUsageResult.Series { - ns := s.Labels["namespace"] - if ns != "" && len(s.DataPoints) > 0 { - cpuUsageMap[ns] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - // Query actual memory usage cost (for efficiency calculation) - memUsageMap := make(map[string]float64) - memUsageResult, err := client.Query(r.Context(), - `sum by (namespace) (label_replace(container_memory_working_set_bytes{container!="", namespace!=""}, "node", "$1", "instance", "(.+?)(?::\\d+)?$") / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) - if err == nil { - for _, s := range memUsageResult.Series { - ns := s.Labels["namespace"] - if ns != "" && len(s.DataPoints) > 0 { - memUsageMap[ns] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - // Query storage (PV) cost per namespace - storageMap := make(map[string]float64) - storageResult, err := client.Query(r.Context(), - `sum by (namespace) (pv_hourly_cost * on(persistentvolume) group_left(namespace) kube_persistentvolume_claim_ref)`) - if err == nil { - for _, s := range storageResult.Series { - ns := s.Labels["namespace"] - if ns != "" && len(s.DataPoints) > 0 { - storageMap[ns] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - // Build per-namespace cost map - nsMap := make(map[string]*NamespaceCost) - - for _, s := range cpuResult.Series { - ns := s.Labels["namespace"] - if ns == "" { - continue - } - if _, ok := nsMap[ns]; !ok { - nsMap[ns] = &NamespaceCost{Name: ns} - } - if len(s.DataPoints) > 0 { - nsMap[ns].CPUCost = s.DataPoints[len(s.DataPoints)-1].Value - } - } - 
- for _, s := range memResult.Series { - ns := s.Labels["namespace"] - if ns == "" { - continue - } - if _, ok := nsMap[ns]; !ok { - nsMap[ns] = &NamespaceCost{Name: ns} - } - if len(s.DataPoints) > 0 { - nsMap[ns].MemoryCost = s.DataPoints[len(s.DataPoints)-1].Value - } - } - - // Calculate totals - var totalHourlyCost, totalStorageCost, totalUsageCost, totalAllocCost float64 - namespaces := make([]NamespaceCost, 0, len(nsMap)) - for _, nc := range nsMap { - nc.HourlyCost = nc.CPUCost + nc.MemoryCost - nc.StorageCost = storageMap[nc.Name] - nc.HourlyCost += nc.StorageCost - totalStorageCost += nc.StorageCost - - // Efficiency - nc.CPUUsageCost = cpuUsageMap[nc.Name] - nc.MemoryUsageCost = memUsageMap[nc.Name] - allocCost := nc.CPUCost + nc.MemoryCost // allocation cost (excl storage) - usageCost := nc.CPUUsageCost + nc.MemoryUsageCost - if allocCost > 0 && usageCost > 0 { - nc.Efficiency = roundTo((usageCost/allocCost)*100, 1) - if nc.Efficiency > 100 { - nc.Efficiency = 100 - } - nc.IdleCost = allocCost - usageCost - if nc.IdleCost < 0 { - nc.IdleCost = 0 - } - } - totalAllocCost += allocCost - totalUsageCost += usageCost - - totalHourlyCost += nc.HourlyCost - namespaces = append(namespaces, *nc) - } - - // Also try to get node-level total cost for a more accurate total - nodeResult, err := client.Query(r.Context(), `sum(node_total_hourly_cost)`) - if err == nil && len(nodeResult.Series) > 0 && len(nodeResult.Series[0].DataPoints) > 0 { - nodeCost := nodeResult.Series[0].DataPoints[0].Value - if nodeCost > totalHourlyCost { - totalHourlyCost = nodeCost - } - } - - // Sort by cost descending - sort.Slice(namespaces, func(i, j int) bool { - return namespaces[i].HourlyCost > namespaces[j].HourlyCost - }) - - // Cluster-level efficiency - var clusterEfficiency float64 - var totalIdleCost float64 - if totalAllocCost > 0 && totalUsageCost > 0 { - clusterEfficiency = roundTo((totalUsageCost/totalAllocCost)*100, 1) - if clusterEfficiency > 100 { - clusterEfficiency = 100 
- } - totalIdleCost = totalAllocCost - totalUsageCost - if totalIdleCost < 0 { - totalIdleCost = 0 - } - } - - // Round to 4 decimal places for cleaner JSON - totalHourlyCost = roundTo(totalHourlyCost, 4) - totalStorageCost = roundTo(totalStorageCost, 4) - totalIdleCost = roundTo(totalIdleCost, 4) - for i := range namespaces { - namespaces[i].HourlyCost = roundTo(namespaces[i].HourlyCost, 4) - namespaces[i].CPUCost = roundTo(namespaces[i].CPUCost, 4) - namespaces[i].MemoryCost = roundTo(namespaces[i].MemoryCost, 4) - namespaces[i].StorageCost = roundTo(namespaces[i].StorageCost, 4) - namespaces[i].CPUUsageCost = roundTo(namespaces[i].CPUUsageCost, 4) - namespaces[i].MemoryUsageCost = roundTo(namespaces[i].MemoryUsageCost, 4) - namespaces[i].IdleCost = roundTo(namespaces[i].IdleCost, 4) - } - - writeJSON(w, http.StatusOK, CostSummary{ - Available: true, - Currency: "USD", - Window: "1h", - TotalHourlyCost: totalHourlyCost, - TotalStorageCost: totalStorageCost, - TotalIdleCost: totalIdleCost, - ClusterEfficiency: clusterEfficiency, - Namespaces: namespaces, - }) -} - -func roundTo(val float64, places int) float64 { - if math.IsNaN(val) || math.IsInf(val, 0) { - return 0 - } - pow := math.Pow(10, float64(places)) - return math.Round(val*pow) / pow + writeJSON(w, http.StatusOK, pkgopencost.ComputeCostSummaryFromProm( + r.Context(), client.Prom(), pkgopencost.SummaryOptions{})) } func writeJSON(w http.ResponseWriter, status int, v interface{}) { @@ -250,12 +47,6 @@ func writeJSON(w http.ResponseWriter, status int, v interface{}) { } } -// workloadKey identifies a workload by name and kind for aggregation. -type workloadKey struct { - name string - kind string -} - // handleWorkloads returns workload-level cost breakdown for a namespace. 
func handleWorkloads(w http.ResponseWriter, r *http.Request) { ns := r.URL.Query().Get("namespace") @@ -266,423 +57,91 @@ func handleWorkloads(w http.ResponseWriter, r *http.Request) { client := prometheuspkg.GetClient() if client == nil { - writeJSON(w, http.StatusOK, WorkloadCostResponse{Namespace: ns, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.WorkloadCostResponse{Namespace: ns, Reason: pkgopencost.ReasonNoPrometheus}) return } - - _, _, err := client.EnsureConnected(r.Context()) - if err != nil { + if _, _, err := client.EnsureConnected(r.Context()); err != nil { log.Printf("[opencost] EnsureConnected failed (workloads): %v", err) - writeJSON(w, http.StatusOK, WorkloadCostResponse{Namespace: ns, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.WorkloadCostResponse{Namespace: ns, Reason: pkgopencost.ReasonNoPrometheus}) return } - // Sanitize namespace for safe PromQL label interpolation - safeNS := prometheuspkg.SanitizeLabelValue(ns) - - // Query per-pod CPU cost in this namespace. 
- // Use "or" to handle both honor_labels configurations: - // exported_namespace="X" → honor_labels=false (namespace was renamed) - // namespace="X", exported_namespace="" → honor_labels=true (no renaming, label absent) - cpuQuery := `sum by (pod) ((avg_over_time(container_cpu_allocation{exported_namespace="` + safeNS + `"}[1h]) or avg_over_time(container_cpu_allocation{namespace="` + safeNS + `", exported_namespace=""}[1h])) * on(node) group_left() node_cpu_hourly_cost)` - cpuResult, err := client.Query(r.Context(), cpuQuery) - if err != nil { - cpuQuery = `sum by (pod) (rate(opencost_container_cpu_cost_total{exported_namespace="` + safeNS + `"}[1h]) or rate(opencost_container_cpu_cost_total{namespace="` + safeNS + `", exported_namespace=""}[1h]))` - cpuResult, err = client.Query(r.Context(), cpuQuery) - if err != nil { - log.Printf("[opencost] Workload CPU cost query failed for %s: %v", ns, err) - writeJSON(w, http.StatusOK, WorkloadCostResponse{Namespace: ns, Reason: ReasonQueryError}) - return - } - } - - // Query per-pod memory cost in this namespace - memQuery := `sum by (pod) ((avg_over_time(container_memory_allocation_bytes{exported_namespace="` + safeNS + `"}[1h]) or avg_over_time(container_memory_allocation_bytes{namespace="` + safeNS + `", exported_namespace=""}[1h])) / 1073741824 * on(node) group_left() node_ram_hourly_cost)` - memResult, err := client.Query(r.Context(), memQuery) - if err != nil { - memQuery = `sum by (pod) (rate(opencost_container_memory_cost_total{exported_namespace="` + safeNS + `"}[1h]) or rate(opencost_container_memory_cost_total{namespace="` + safeNS + `", exported_namespace=""}[1h]))` - memResult, err = client.Query(r.Context(), memQuery) - if err != nil { - log.Printf("[opencost] Workload memory cost query failed for %s: %v", ns, err) - writeJSON(w, http.StatusOK, WorkloadCostResponse{Namespace: ns, Reason: ReasonQueryError}) - return - } - } - - // Query per-pod CPU usage cost (for efficiency) - podCPUUsage := 
make(map[string]float64) - cpuUsageQuery := `sum by (pod) (label_replace(rate(container_cpu_usage_seconds_total{container!="", namespace="` + safeNS + `"}[1h]), "node", "$1", "instance", "(.+?)(?::\\d+)?$") * on(node) group_left() node_cpu_hourly_cost)` - cpuUsageResult, usageErr := client.Query(r.Context(), cpuUsageQuery) - if usageErr == nil { - for _, s := range cpuUsageResult.Series { - pod := s.Labels["pod"] - if pod != "" && len(s.DataPoints) > 0 { - podCPUUsage[pod] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - // Query per-pod memory usage cost (for efficiency) - podMemUsage := make(map[string]float64) - memUsageQuery := `sum by (pod) (label_replace(container_memory_working_set_bytes{container!="", namespace="` + safeNS + `"}, "node", "$1", "instance", "(.+?)(?::\\d+)?$") / 1073741824 * on(node) group_left() node_ram_hourly_cost)` - memUsageResult, usageErr := client.Query(r.Context(), memUsageQuery) - if usageErr == nil { - for _, s := range memUsageResult.Series { - pod := s.Labels["pod"] - if pod != "" && len(s.DataPoints) > 0 { - podMemUsage[pod] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - // Build per-pod cost map - type podCost struct { - cpuCost float64 - memoryCost float64 - cpuUsage float64 - memoryUsage float64 - } - podCosts := make(map[string]*podCost) - - for _, s := range cpuResult.Series { - pod := s.Labels["pod"] - if pod == "" { - continue - } - if _, ok := podCosts[pod]; !ok { - podCosts[pod] = &podCost{} - } - if len(s.DataPoints) > 0 { - podCosts[pod].cpuCost = s.DataPoints[len(s.DataPoints)-1].Value - } - } - - for _, s := range memResult.Series { - pod := s.Labels["pod"] - if pod == "" { - continue - } - if _, ok := podCosts[pod]; !ok { - podCosts[pod] = &podCost{} - } - if len(s.DataPoints) > 0 { - podCosts[pod].memoryCost = s.DataPoints[len(s.DataPoints)-1].Value - } - } - - // Merge usage data into pod costs - for pod, pc := range podCosts { - pc.cpuUsage = podCPUUsage[pod] - pc.memoryUsage = 
podMemUsage[pod] - } + writeJSON(w, http.StatusOK, pkgopencost.ComputeWorkloadsFromProm( + r.Context(), client.Prom(), ns, buildPodOwnerLookup(ns))) +} - // Resolve pod -> workload using K8s cache owner references - podOwnerMap := make(map[string]workloadKey) +// buildPodOwnerLookup snapshots radar's pod informer for `ns` so +// pkg/opencost.ComputeWorkloadsFromProm can resolve pod→workload without +// depending on client-go. +func buildPodOwnerLookup(ns string) pkgopencost.PodOwnerLookup { rc := k8s.GetResourceCache() - if rc != nil && rc.Pods() != nil { - pods, _ := rc.Pods().Pods(ns).List(labels.Everything()) - for _, p := range pods { - podOwnerMap[p.Name] = resolveOwner(p.OwnerReferences) - } + if rc == nil || rc.Pods() == nil { + return nil } - - workloadMap := make(map[workloadKey]*WorkloadCost) - for podName, pc := range podCosts { - owner, ok := podOwnerMap[podName] - if !ok { - // Fallback: strip pod hash suffixes to guess workload name - owner = workloadKey{name: stripPodSuffix(podName), kind: "standalone"} - } - - wl, exists := workloadMap[owner] - if !exists { - wl = &WorkloadCost{Name: owner.name, Kind: owner.kind} - workloadMap[owner] = wl - } - wl.CPUCost += pc.cpuCost - wl.MemoryCost += pc.memoryCost - wl.CPUUsageCost += pc.cpuUsage - wl.MemoryUsageCost += pc.memoryUsage - wl.Replicas++ + pods, err := rc.Pods().Pods(ns).List(labels.Everything()) + if err != nil || len(pods) == 0 { + return nil } - - // Build sorted result - workloads := make([]WorkloadCost, 0, len(workloadMap)) - for _, wl := range workloadMap { - wl.HourlyCost = wl.CPUCost + wl.MemoryCost - // Compute efficiency - allocCost := wl.CPUCost + wl.MemoryCost - usageCost := wl.CPUUsageCost + wl.MemoryUsageCost - if allocCost > 0 && usageCost > 0 { - wl.Efficiency = roundTo((usageCost/allocCost)*100, 1) - if wl.Efficiency > 100 { - wl.Efficiency = 100 - } - wl.IdleCost = allocCost - usageCost - if wl.IdleCost < 0 { - wl.IdleCost = 0 - } - } - wl.HourlyCost = roundTo(wl.HourlyCost, 4) - 
wl.CPUCost = roundTo(wl.CPUCost, 4) - wl.MemoryCost = roundTo(wl.MemoryCost, 4) - wl.CPUUsageCost = roundTo(wl.CPUUsageCost, 4) - wl.MemoryUsageCost = roundTo(wl.MemoryUsageCost, 4) - wl.IdleCost = roundTo(wl.IdleCost, 4) - workloads = append(workloads, *wl) + owners := make(map[string]pkgopencost.WorkloadOwner, len(pods)) + for _, p := range pods { + owners[p.Name] = resolvePodOwner(p.OwnerReferences) + } + return func(podName string) (pkgopencost.WorkloadOwner, bool) { + o, ok := owners[podName] + return o, ok } - sort.Slice(workloads, func(i, j int) bool { - return workloads[i].HourlyCost > workloads[j].HourlyCost - }) - - writeJSON(w, http.StatusOK, WorkloadCostResponse{ - Available: true, - Namespace: ns, - Workloads: workloads, - }) } -// resolveOwner walks owner references to find the top-level workload. -// For pods owned by ReplicaSets, it strips the RS hash suffix to get the Deployment name. -func resolveOwner(owners []metav1.OwnerReference) workloadKey { - if len(owners) == 0 { - return workloadKey{kind: "standalone"} +// resolvePodOwner walks owner references to find the top-level workload. +// Pods owned by a ReplicaSet are mapped back to the parent Deployment by +// stripping the RS hash suffix. 
+func resolvePodOwner(refs []metav1.OwnerReference) pkgopencost.WorkloadOwner { + if len(refs) == 0 { + return pkgopencost.WorkloadOwner{Kind: "standalone"} } - - owner := owners[0] - - // If owned by a ReplicaSet, strip hash suffix to get the Deployment name + owner := refs[0] if owner.Kind == "ReplicaSet" { - deployName := stripReplicaSetSuffix(owner.Name) - if deployName != owner.Name { - return workloadKey{name: deployName, kind: "Deployment"} + if deployName := stripReplicaSetSuffix(owner.Name); deployName != owner.Name { + return pkgopencost.WorkloadOwner{Name: deployName, Kind: "Deployment"} } } - - return workloadKey{name: owner.Name, kind: owner.Kind} + return pkgopencost.WorkloadOwner{Name: owner.Name, Kind: owner.Kind} } -// stripReplicaSetSuffix removes the hash suffix from a ReplicaSet name -// (e.g., "myapp-7f8d9c" -> "myapp"). func stripReplicaSetSuffix(name string) string { - idx := strings.LastIndex(name, "-") - if idx > 0 { + if idx := strings.LastIndex(name, "-"); idx > 0 { return name[:idx] } return name } -// stripPodSuffix removes pod hash suffixes to approximate the workload name. -// e.g., "myapp-7f8d9c-xyz12" -> "myapp" -func stripPodSuffix(name string) string { - // Strip last segment (pod hash) - idx := strings.LastIndex(name, "-") - if idx <= 0 { - return name - } - name = name[:idx] - // Strip RS hash segment - idx = strings.LastIndex(name, "-") - if idx <= 0 { - return name - } - return name[:idx] -} - -// parseCostTimeRange parses the "range" query parameter into start/end/step for cost trends. 
-func parseCostTimeRange(rangeStr string) (start, end time.Time, step time.Duration, label string) { - end = time.Now() - switch rangeStr { - case "6h": - start = end.Add(-6 * time.Hour) - step = 15 * time.Minute - label = "6h" - case "7d": - start = end.Add(-7 * 24 * time.Hour) - step = 6 * time.Hour - label = "7d" - default: // "24h" - start = end.Add(-24 * time.Hour) - step = time.Hour - label = "24h" - } - return -} - // handleTrend returns cost trend data over time as a stacked series per namespace. func handleTrend(w http.ResponseWriter, r *http.Request) { client := prometheuspkg.GetClient() if client == nil { - writeJSON(w, http.StatusOK, CostTrendResponse{Available: false, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.CostTrendResponse{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - _, _, err := client.EnsureConnected(r.Context()) - if err != nil { + if _, _, err := client.EnsureConnected(r.Context()); err != nil { log.Printf("[opencost] EnsureConnected failed (trend): %v", err) - writeJSON(w, http.StatusOK, CostTrendResponse{Available: false, Reason: ReasonNoPrometheus}) - return - } - - rangeStr := r.URL.Query().Get("range") - start, end, step, label := parseCostTimeRange(rangeStr) - - // Combined CPU + memory allocation cost per namespace over time. - // label_replace normalises exported_namespace → namespace when honor_labels=false. 
- query := `sum by (namespace) ( - label_replace(avg_over_time(container_cpu_allocation{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") * on(node) group_left() node_cpu_hourly_cost -) + sum by (namespace) ( - label_replace(avg_over_time(container_memory_allocation_bytes{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") / 1073741824 * on(node) group_left() node_ram_hourly_cost -)` - - result, err := client.QueryRange(r.Context(), query, start, end, step) - if err != nil { - log.Printf("[opencost] Trend query failed: %v", err) - writeJSON(w, http.StatusOK, CostTrendResponse{Available: false, Reason: ReasonQueryError}) + writeJSON(w, http.StatusOK, pkgopencost.CostTrendResponse{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - if len(result.Series) == 0 { - writeJSON(w, http.StatusOK, CostTrendResponse{Available: false, Reason: ReasonNoMetrics}) - return - } - - // Rank namespaces by latest cost to pick top 8 - type nsRank struct { - ns string - lastCost float64 - idx int - } - ranks := make([]nsRank, 0, len(result.Series)) - for i, s := range result.Series { - ns := s.Labels["namespace"] - if ns == "" { - continue - } - var last float64 - if len(s.DataPoints) > 0 { - last = s.DataPoints[len(s.DataPoints)-1].Value - } - ranks = append(ranks, nsRank{ns: ns, lastCost: last, idx: i}) - } - sort.Slice(ranks, func(i, j int) bool { return ranks[i].lastCost > ranks[j].lastCost }) - - const maxSeries = 8 - topSet := make(map[int]bool) - series := make([]CostTrendSeries, 0, maxSeries+1) - for i, r := range ranks { - if i >= maxSeries { - break - } - topSet[r.idx] = true - s := result.Series[r.idx] - dps := make([]CostDataPoint, 0, len(s.DataPoints)) - for _, dp := range s.DataPoints { - dps = append(dps, CostDataPoint{Timestamp: dp.Timestamp, Value: roundTo(dp.Value, 4)}) - } - series = append(series, CostTrendSeries{Namespace: r.ns, DataPoints: dps}) - } - - // Aggregate remaining into "other" - if len(ranks) > 
maxSeries { - // Collect all timestamps from any overflow series - otherMap := make(map[int64]float64) - for i, s := range result.Series { - if topSet[i] { - continue - } - for _, dp := range s.DataPoints { - otherMap[dp.Timestamp] += dp.Value - } - } - if len(otherMap) > 0 { - dps := make([]CostDataPoint, 0, len(otherMap)) - for ts, val := range otherMap { - dps = append(dps, CostDataPoint{Timestamp: ts, Value: roundTo(val, 4)}) - } - sort.Slice(dps, func(i, j int) bool { return dps[i].Timestamp < dps[j].Timestamp }) - series = append(series, CostTrendSeries{Namespace: "other", DataPoints: dps}) - } - } - - writeJSON(w, http.StatusOK, CostTrendResponse{ - Available: true, - Range: label, - Series: series, - }) + writeJSON(w, http.StatusOK, pkgopencost.ComputeCostTrendFromProm( + r.Context(), client.Prom(), pkgopencost.TrendPromOptions{Range: r.URL.Query().Get("range")})) } // handleNodes returns per-node cost breakdown. func handleNodes(w http.ResponseWriter, r *http.Request) { client := prometheuspkg.GetClient() if client == nil { - writeJSON(w, http.StatusOK, NodeCostResponse{Available: false, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.NodeCostResponse{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - _, _, err := client.EnsureConnected(r.Context()) - if err != nil { + if _, _, err := client.EnsureConnected(r.Context()); err != nil { log.Printf("[opencost] EnsureConnected failed (nodes): %v", err) - writeJSON(w, http.StatusOK, NodeCostResponse{Available: false, Reason: ReasonNoPrometheus}) + writeJSON(w, http.StatusOK, pkgopencost.NodeCostResponse{Available: false, Reason: pkgopencost.ReasonNoPrometheus}) return } - - // Query per-node total hourly cost (includes labels: node, instance_type, region) - totalResult, err := client.Query(r.Context(), `node_total_hourly_cost`) - if err != nil { - log.Printf("[opencost] Node cost query failed: %v", err) - writeJSON(w, http.StatusOK, NodeCostResponse{Available: false, 
Reason: ReasonQueryError}) - return - } - if len(totalResult.Series) == 0 { - writeJSON(w, http.StatusOK, NodeCostResponse{Available: false, Reason: ReasonNoMetrics}) - return - } - - // Query per-node CPU and memory costs - cpuMap := make(map[string]float64) - cpuResult, err := client.Query(r.Context(), `node_cpu_hourly_cost`) - if err == nil { - for _, s := range cpuResult.Series { - node := s.Labels["node"] - if node != "" && len(s.DataPoints) > 0 { - cpuMap[node] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - memMap := make(map[string]float64) - memResult, err := client.Query(r.Context(), `node_ram_hourly_cost`) - if err == nil { - for _, s := range memResult.Series { - node := s.Labels["node"] - if node != "" && len(s.DataPoints) > 0 { - memMap[node] = s.DataPoints[len(s.DataPoints)-1].Value - } - } - } - - nodes := make([]NodeCost, 0, len(totalResult.Series)) - for _, s := range totalResult.Series { - node := s.Labels["node"] - if node == "" || len(s.DataPoints) == 0 { - continue - } - nc := NodeCost{ - Name: node, - InstanceType: s.Labels["instance_type"], - Region: s.Labels["region"], - HourlyCost: roundTo(s.DataPoints[len(s.DataPoints)-1].Value, 4), - CPUCost: roundTo(cpuMap[node], 4), - MemoryCost: roundTo(memMap[node], 4), - } - nodes = append(nodes, nc) - } - - sort.Slice(nodes, func(i, j int) bool { return nodes[i].HourlyCost > nodes[j].HourlyCost }) - - writeJSON(w, http.StatusOK, NodeCostResponse{ - Available: true, - Nodes: nodes, - }) + writeJSON(w, http.StatusOK, pkgopencost.ComputeNodeCosts(r.Context(), client.Prom())) } diff --git a/internal/prometheus/client.go b/internal/prometheus/client.go index 97660a51c..6fd4441fa 100644 --- a/internal/prometheus/client.go +++ b/internal/prometheus/client.go @@ -2,14 +2,10 @@ package prometheus import ( "context" - "encoding/json" - "fmt" - "io" + "errors" "log" "maps" "net/http" - "net/url" - "strconv" "strings" "sync" "time" @@ -18,19 +14,24 @@ import ( "k8s.io/client-go/rest" 
"github.com/skyhook-io/radar/internal/errorlog" + "github.com/skyhook-io/radar/pkg/prom" ) -// Client is a Prometheus HTTP API client with auto-discovery. +// Client is radar's application-scoped Prometheus client. It holds the +// K8s-aware state required for kubectl-like port-forward discovery, along +// with a pkg/prom.Client that performs the actual HTTP calls once an +// endpoint has been discovered. type Client struct { mu sync.RWMutex - // Discovered/configured connection - baseURL string // e.g. "http://localhost:54321" or "http://prometheus.monitoring.svc:9090" - basePath string // e.g. "/select/0/prometheus" for vmselect + // Effective connection (populated after discover succeeds). + baseURL string + basePath string + prom *prom.Client // rebuilt whenever baseURL/basePath changes // Discovery state discovered bool - discoveryService *ServiceInfo // discovered service info for port-forward + discoveryService *prom.ServiceInfo // discovered service info for port-forward manualURL string // --prometheus-url override headers map[string]string @@ -39,27 +40,10 @@ type Client struct { k8sConfig *rest.Config contextName string + // Shared HTTP client used when constructing the underlying pkg/prom.Client. httpClient *http.Client } -// ServiceInfo holds info about a discovered Prometheus service. -type ServiceInfo struct { - Namespace string `json:"namespace"` - Name string `json:"name"` - Port int `json:"port"` - BasePath string `json:"basePath,omitempty"` -} - -// Status represents the current Prometheus connection status. 
-type Status struct { - Available bool `json:"available"` - Connected bool `json:"connected"` - Address string `json:"address,omitempty"` - Service *ServiceInfo `json:"service,omitempty"` - ContextName string `json:"contextName,omitempty"` - Error string `json:"error,omitempty"` -} - // Global client instance var ( globalClient *Client @@ -75,9 +59,7 @@ func Initialize(client kubernetes.Interface, config *rest.Config, contextName st k8sClient: client, k8sConfig: config, contextName: contextName, - httpClient: &http.Client{ - Timeout: 10 * time.Second, - }, + httpClient: &http.Client{Timeout: 10 * time.Second}, } } @@ -102,6 +84,9 @@ func SetHeaders(h map[string]string) { c.mu.Lock() defer c.mu.Unlock() c.headers = copyHeaders(h) + // Drop the cached prom.Client so the next request rebuilds its transport + // with the new headers. + c.prom = nil } func copyHeaders(h map[string]string) map[string]string { @@ -113,18 +98,6 @@ func copyHeaders(h map[string]string) map[string]string { return out } -// SetURL overrides discovery with a specific Prometheus URL. -// Clears existing connection state so the next EnsureConnected uses this URL. -func (c *Client) SetURL(rawURL string) { - c.mu.Lock() - defer c.mu.Unlock() - c.manualURL = strings.TrimRight(rawURL, "/") - c.baseURL = "" - c.basePath = "" - c.discovered = false - c.discoveryService = nil -} - // GetClient returns the global Prometheus client (may be nil). 
func GetClient() *Client { clientMu.RLock() @@ -140,6 +113,7 @@ func Reset() { globalClient.mu.Lock() globalClient.baseURL = "" globalClient.basePath = "" + globalClient.prom = nil globalClient.discovered = false globalClient.discoveryService = nil globalClient.mu.Unlock() @@ -154,10 +128,10 @@ func Reinitialize(client kubernetes.Interface, config *rest.Config, contextName manualURL := "" var headers map[string]string if globalClient != nil { - // SetURL / SetHeaders write these under the per-client mutex after - // dropping clientMu, so reading without c.mu here would race even - // though we hold clientMu exclusively. copyHeaders also detaches the - // map from the old client so a late mutation can't bleed through. + // SetManualURL / SetHeaders write these under the per-client mutex + // after dropping clientMu, so reading without c.mu here would race + // even though we hold clientMu exclusively. copyHeaders also detaches + // the map from the old client so a late mutation can't bleed through. globalClient.mu.RLock() manualURL = globalClient.manualURL headers = copyHeaders(globalClient.headers) @@ -170,24 +144,22 @@ func Reinitialize(client kubernetes.Interface, config *rest.Config, contextName contextName: contextName, manualURL: manualURL, headers: headers, - httpClient: &http.Client{ - Timeout: 10 * time.Second, - }, + httpClient: &http.Client{Timeout: 10 * time.Second}, } } // GetStatus returns the current Prometheus connection status. -func (c *Client) GetStatus() Status { +func (c *Client) GetStatus() prom.Status { c.mu.RLock() defer c.mu.RUnlock() - var svc *ServiceInfo + var svc *prom.ServiceInfo if c.discoveryService != nil { cp := *c.discoveryService svc = &cp } - return Status{ + return prom.Status{ Available: c.baseURL != "", Connected: c.baseURL != "", Address: c.baseURL, @@ -196,277 +168,145 @@ func (c *Client) GetStatus() Status { } } -// EnsureConnected attempts to discover and connect to Prometheus if not already connected. 
-// Returns the base URL and base path, or an error. +// EnsureConnected attempts to discover and connect to Prometheus if not +// already connected. Returns the base URL and base path, or an error. func (c *Client) EnsureConnected(ctx context.Context) (string, string, error) { c.mu.RLock() - if c.baseURL != "" { - // Verify cached address still works - base := c.baseURL - bp := c.basePath - c.mu.RUnlock() - if c.probe(ctx, base+bp) { - return base, bp, nil + base := c.baseURL + bp := c.basePath + c.mu.RUnlock() + + if base != "" { + // Probe whatever we already have, building the pkg/prom.Client + // on-demand. The cached client may be nil here for two reasons: + // (a) a concurrent request hasn't yet primed getPromClient, or + // (b) SetHeaders cleared the cache to force a header reload. + // In both cases the connection itself is still valid; only the + // cached client wrapper needs rebuilding. Pre-extraction probed + // solely on base!="", so this preserves that behavior. + if p := c.getPromClient(); p != nil { + ok, reason := p.Probe(ctx) + if ok { + return base, bp, nil + } + log.Printf("[prometheus] cached connection to %s failed probe (reason=%s), rediscovering", base, reason) + c.mu.Lock() + c.baseURL = "" + c.basePath = "" + c.prom = nil + c.discovered = false + c.mu.Unlock() } - // Stale — clear and rediscover - c.mu.Lock() - c.baseURL = "" - c.basePath = "" - c.discovered = false - c.mu.Unlock() - } else { - c.mu.RUnlock() } return c.discover(ctx) } -// QueryRange executes a Prometheus range query. 
-func (c *Client) QueryRange(ctx context.Context, query string, start, end time.Time, step time.Duration) (*QueryResult, error) { - base, basePath, err := c.EnsureConnected(ctx) - if err != nil { - return nil, err - } - - params := url.Values{ - "query": {query}, - "start": {strconv.FormatInt(start.Unix(), 10)}, - "end": {strconv.FormatInt(end.Unix(), 10)}, - "step": {fmt.Sprintf("%.0f", step.Seconds())}, - } - - reqURL := fmt.Sprintf("%s%s/api/v1/query_range?%s", base, basePath, params.Encode()) - return c.doQuery(ctx, reqURL) +// Prom returns the underlying pkg/prom.Client for callers that compose +// cost math on top of raw Query/QueryRange (e.g., +// pkg/opencost.ComputeCostSummaryFromProm). Unlike Query/QueryRange this +// does NOT call EnsureConnected; callers must have done so to ensure a +// baseURL is set. Returns nil if discovery has not run. +func (c *Client) Prom() *prom.Client { + return c.getPromClient() } -// Query executes a Prometheus instant query. -func (c *Client) Query(ctx context.Context, query string) (*QueryResult, error) { - base, basePath, err := c.EnsureConnected(ctx) - if err != nil { - return nil, err - } - - params := url.Values{ - "query": {query}, - } - - reqURL := fmt.Sprintf("%s%s/api/v1/query?%s", base, basePath, params.Encode()) - return c.doQuery(ctx, reqURL) -} - -func (c *Client) doQuery(ctx context.Context, reqURL string) (*QueryResult, error) { - req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil) - if err != nil { - return nil, fmt.Errorf("creating request: %w", err) - } - c.applyHeaders(req) - - resp, err := c.httpClient.Do(req) - if err != nil { - errorlog.Record("prometheus", "error", "HTTP request failed: %v", err) - return nil, fmt.Errorf("querying prometheus: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) // 10 MB cap - if err != nil { - return nil, fmt.Errorf("reading response: %w", err) - } - - if resp.StatusCode != http.StatusOK { - 
errorlog.Record("prometheus", "error", "returned status %d: %s", resp.StatusCode, string(body)) - return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body)) +// getPromClient returns a pkg/prom.Client pointed at the current baseURL/basePath, +// building (and caching) one if necessary. Callers must NOT hold c.mu: this +// method acquires the lock itself, and sync.RWMutex is not reentrant. +func (c *Client) getPromClient() *prom.Client { + c.mu.RLock() + if c.prom != nil { + p := c.prom + c.mu.RUnlock() + return p } + base, bp, httpC := c.baseURL, c.basePath, c.httpClient + headers := copyHeaders(c.headers) + c.mu.RUnlock() - var promResp promResponse - if err := json.Unmarshal(body, &promResp); err != nil { - return nil, fmt.Errorf("parsing response: %w", err) + if base == "" { + return nil } - if promResp.Status != "success" { - return nil, fmt.Errorf("prometheus error: %s (%s)", promResp.Error, promResp.ErrorType) + tr := prom.NewHTTPTransport(base, bp, httpC) + tr.Headers = headers + p := prom.NewClient(tr) + c.mu.Lock() + // Double-check in case another goroutine built one. + if c.prom == nil { + c.prom = p + } else { + p = c.prom } - - return parseQueryResult(promResp.Data) + c.mu.Unlock() + return p } -// applyHeaders attaches the configured custom headers to req under the -// client's read lock, so a concurrent SetHeaders / Reinitialize doesn't race. -func (c *Client) applyHeaders(req *http.Request) { +// probe checks if a Prometheus endpoint at `addr` is reachable and has at +// least one active scrape target, using pkg/prom.Client.Probe. Records a +// targeted log entry for every non-OK outcome so operators can see why a +// candidate was rejected — particularly important for auth failures (401/403) +// and empty instances, which would otherwise silently fall through the +// discovery candidate list.
+func (c *Client) probe(ctx context.Context, addr string) bool { c.mu.RLock() - defer c.mu.RUnlock() - for k, v := range c.headers { - req.Header.Set(k, v) + httpC := c.httpClient + headers := copyHeaders(c.headers) + c.mu.RUnlock() + tr := prom.NewHTTPTransport(addr, "", httpC) + tr.Headers = headers + ok, reason := prom.NewClient(tr).Probe(ctx) + if !ok { + logProbeRejection(addr, reason) } + return ok } -// probe checks if a Prometheus endpoint is reachable and has data. -// An instance that responds HTTP 200 but returns zero results for "up" -// (no active scrape targets) is treated as unreachable so discovery -// continues to the next candidate. -func (c *Client) probe(ctx context.Context, addr string) bool { - testCtx, cancel := context.WithTimeout(ctx, 3*time.Second) - defer cancel() - - req, err := http.NewRequestWithContext(testCtx, "GET", addr+"/api/v1/query?query=up", nil) - if err != nil { - return false +// logProbeRejection records an appropriate log entry for each rejection +// reason. Auth failures get errorlog at error level (likely operator +// misconfiguration); empty instances get warning level (cluster state); +// other failures use stdlib log so they appear in the discovery audit +// trail without flooding errorlog. 
+func logProbeRejection(addr string, reason prom.ProbeReason) { + switch reason { + case prom.ProbeReasonAuthError: + errorlog.Record("prometheus", "error", + "endpoint %s rejected credentials (HTTP 401/403, check --prometheus-header)", addr) + case prom.ProbeReasonEmptyInstance: + errorlog.Record("prometheus", "warning", + "endpoint %s has no active scrape targets (empty instance), skipping", addr) + case prom.ProbeReasonNotPrometheus: + log.Printf("[prometheus] endpoint %s responded but not in Prometheus format, skipping", addr) + case prom.ProbeReasonPromError: + log.Printf("[prometheus] endpoint %s returned Prometheus error status, skipping", addr) + case prom.ProbeReasonTransportError: + log.Printf("[prometheus] endpoint %s unreachable, skipping", addr) } - c.applyHeaders(req) - - resp, err := c.httpClient.Do(req) - if err != nil { - return false - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - // Surface auth failures explicitly — otherwise a misconfigured Bearer - // token shows up as "Prometheus not found" after discovery falls - // through every candidate. - if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { - errorlog.Record("prometheus", "error", "endpoint %s returned HTTP %d (check --prometheus-header credentials)", addr, resp.StatusCode) - } - return false - } - - // Verify the instance actually has scrape targets. An empty VictoriaMetrics - // or Prometheus instance returns 200 with zero results — skip it. - // 10 MB matches doQuery's limit so a large cluster's `up` response fits. 
- body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) - if err != nil { - return false - } - var promResp struct { - Status string `json:"status"` - Data struct { - Result []json.RawMessage `json:"result"` - } `json:"data"` - } - if err := json.Unmarshal(body, &promResp); err != nil { - // A 200 response that isn't Prometheus JSON is almost certainly not - // Prometheus (captive portal, ingress login page, misconfigured proxy). - return false - } - if promResp.Status != "success" { - // Some proxies return 200 with a Prometheus-shaped error body. - return false - } - if len(promResp.Data.Result) == 0 { - errorlog.Record("prometheus", "warning", "endpoint %s has no active scrape targets (empty instance), skipping", addr) - return false - } - return true } -// Prometheus API response types - -type promResponse struct { - Status string `json:"status"` - Data json.RawMessage `json:"data"` - ErrorType string `json:"errorType,omitempty"` - Error string `json:"error,omitempty"` -} - -// QueryResult is the parsed result of a Prometheus query. -type QueryResult struct { - ResultType string `json:"resultType"` - Series []Series `json:"series"` -} - -// Series is a single time series from a Prometheus query. -type Series struct { - Labels map[string]string `json:"labels"` - DataPoints []DataPoint `json:"dataPoints"` -} - -// DataPoint is a single (timestamp, value) pair. 
-type DataPoint struct { - Timestamp int64 `json:"timestamp"` - Value float64 `json:"value"` -} - -func parseQueryResult(data json.RawMessage) (*QueryResult, error) { - var raw struct { - ResultType string `json:"resultType"` - Result []struct { - Metric map[string]string `json:"metric"` - Values [][]interface{} `json:"values"` // for matrix - Value []interface{} `json:"value"` // for vector - } `json:"result"` - } - - if err := json.Unmarshal(data, &raw); err != nil { - return nil, fmt.Errorf("parsing result: %w", err) - } - - result := &QueryResult{ - ResultType: raw.ResultType, - Series: make([]Series, 0, len(raw.Result)), +// QueryRange executes a Prometheus range query via the underlying pkg/prom.Client. +func (c *Client) QueryRange(ctx context.Context, query string, start, end time.Time, step time.Duration) (*prom.QueryResult, error) { + if _, _, err := c.EnsureConnected(ctx); err != nil { + return nil, err } - - for _, r := range raw.Result { - series := Series{ - Labels: r.Metric, - } - - if raw.ResultType == "matrix" { - series.DataPoints = make([]DataPoint, 0, len(r.Values)) - for _, v := range r.Values { - dp, err := parseDataPoint(v) - if err != nil { - log.Printf("[prometheus] Skipping invalid data point: %v", err) - continue - } - series.DataPoints = append(series.DataPoints, dp) - } - } else if raw.ResultType == "vector" && r.Value != nil { - dp, err := parseDataPoint(r.Value) - if err != nil { - log.Printf("[prometheus] Skipping invalid vector data point: %v", err) - } else { - series.DataPoints = []DataPoint{dp} - } - } - - result.Series = append(result.Series, series) + p := c.getPromClient() + if p == nil { + // Concurrent Reset cleared baseURL between EnsureConnected returning + // and getPromClient — the connection was reset under us. 
+ return nil, errors.New("prometheus connection was reset") } - - return result, nil + return p.QueryRange(ctx, query, start, end, step) } -func parseDataPoint(v []interface{}) (DataPoint, error) { - if len(v) != 2 { - return DataPoint{}, fmt.Errorf("expected 2 elements, got %d", len(v)) - } - - // Timestamp can be float64 or json.Number - var ts float64 - switch t := v[0].(type) { - case float64: - ts = t - case json.Number: - var err error - ts, err = t.Float64() - if err != nil { - return DataPoint{}, fmt.Errorf("parsing timestamp: %w", err) - } - default: - return DataPoint{}, fmt.Errorf("unexpected timestamp type: %T", v[0]) - } - - // Value is always a string in Prometheus responses - valStr, ok := v[1].(string) - if !ok { - return DataPoint{}, fmt.Errorf("expected string value, got %T", v[1]) +// Query executes a Prometheus instant query via the underlying pkg/prom.Client. +func (c *Client) Query(ctx context.Context, query string) (*prom.QueryResult, error) { + if _, _, err := c.EnsureConnected(ctx); err != nil { + return nil, err } - val, err := strconv.ParseFloat(valStr, 64) - if err != nil { - return DataPoint{}, fmt.Errorf("parsing value %q: %w", valStr, err) + p := c.getPromClient() + if p == nil { + return nil, errors.New("prometheus connection was reset") } - - return DataPoint{ - Timestamp: int64(ts), - Value: val, - }, nil + return p.Query(ctx, query) } diff --git a/internal/prometheus/client_test.go b/internal/prometheus/client_test.go index 83fb76c11..910644f0c 100644 --- a/internal/prometheus/client_test.go +++ b/internal/prometheus/client_test.go @@ -87,7 +87,7 @@ func TestProbe(t *testing.T) { } } -func TestHeadersOnQuery(t *testing.T) { +func TestHeadersOnProbe(t *testing.T) { var gotAuth, gotOrg atomic.Value gotAuth.Store("") gotOrg.Store("") @@ -96,8 +96,6 @@ func TestHeadersOnQuery(t *testing.T) { gotAuth.Store(r.Header.Get("Authorization")) gotOrg.Store(r.Header.Get("X-Scope-OrgID")) w.WriteHeader(http.StatusOK) - // One result so 
probe()'s "empty instance" check passes; doQuery - // doesn't care about the body shape for this test. _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"job":"prometheus"},"value":[1700000000,"1"]}]}}`)) })) defer srv.Close() @@ -110,8 +108,8 @@ func TestHeadersOnQuery(t *testing.T) { }, } - if _, err := c.doQuery(context.Background(), srv.URL+"/api/v1/query?query=up"); err != nil { - t.Fatalf("doQuery failed: %v", err) + if !c.probe(context.Background(), srv.URL) { + t.Fatal("probe() returned false for healthy server") } if got := gotAuth.Load().(string); got != "Bearer test-token" { t.Errorf("Authorization header = %q, want %q", got, "Bearer test-token") @@ -119,16 +117,6 @@ func TestHeadersOnQuery(t *testing.T) { if got := gotOrg.Load().(string); got != "tenant-7" { t.Errorf("X-Scope-OrgID header = %q, want %q", got, "tenant-7") } - - // probe() must carry the same headers — otherwise discovery would 401 - // against an auth-protected endpoint before any real query runs. 
- gotAuth.Store("") - if !c.probe(context.Background(), srv.URL) { - t.Fatal("probe() returned false for healthy server") - } - if got := gotAuth.Load().(string); got != "Bearer test-token" { - t.Errorf("probe Authorization header = %q, want %q", got, "Bearer test-token") - } } func TestHeadersNoneWhenUnset(t *testing.T) { @@ -139,13 +127,13 @@ func TestHeadersNoneWhenUnset(t *testing.T) { sawAuth.Store(true) } w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"job":"prometheus"},"value":[1700000000,"1"]}]}}`)) })) defer srv.Close() c := &Client{httpClient: &http.Client{Timeout: 5 * time.Second}} - if _, err := c.doQuery(context.Background(), srv.URL+"/api/v1/query?query=up"); err != nil { - t.Fatalf("doQuery failed: %v", err) + if !c.probe(context.Background(), srv.URL) { + t.Fatal("probe() returned false for healthy server") } if sawAuth.Load() { t.Error("Authorization header sent when none configured") diff --git a/internal/prometheus/discovery.go b/internal/prometheus/discovery.go index 4994903c5..10cceede8 100644 --- a/internal/prometheus/discovery.go +++ b/internal/prometheus/discovery.go @@ -4,76 +4,27 @@ import ( "context" "fmt" "log" - "sort" "strings" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/skyhook-io/radar/internal/errorlog" "github.com/skyhook-io/radar/internal/portforward" + "github.com/skyhook-io/radar/pkg/prom" ) -// Well-known Prometheus/VictoriaMetrics service locations -// (similar to traffic/caretta.go but with different ordering for workload metrics discovery). 
-var wellKnownLocations = []struct { - namespace string - name string - port int // 0 = use service's first port - basePath string // sub-path for Prometheus API -}{ - // VictoriaMetrics — monitoring namespace first (workload metrics) - {"monitoring", "victoria-metrics-victoria-metrics-single-server", 8428, ""}, - {"monitoring", "victoria-metrics-single-server", 8428, ""}, - {"monitoring", "vmsingle", 8428, ""}, - {"monitoring", "vmselect", 8481, "/select/0/prometheus"}, - {"victoria-metrics", "victoria-metrics-victoria-metrics-single-server", 8428, ""}, - {"victoria-metrics", "victoria-metrics-single-server", 8428, ""}, - {"victoria-metrics", "vmsingle", 8428, ""}, - {"victoria-metrics", "vmselect", 8481, "/select/0/prometheus"}, - // kube-prometheus-stack - {"monitoring", "kube-prometheus-stack-prometheus", 9090, ""}, - {"monitoring", "prometheus-kube-prometheus-prometheus", 9090, ""}, - {"monitoring", "prometheus-operated", 9090, ""}, - // Standard Prometheus - {"opencost", "prometheus-server", 0, ""}, - {"monitoring", "prometheus-server", 0, ""}, - {"prometheus", "prometheus-server", 0, ""}, - {"observability", "prometheus-server", 0, ""}, - {"metrics", "prometheus-server", 0, ""}, - {"kube-system", "prometheus", 0, ""}, - {"default", "prometheus", 0, ""}, - // VictoriaMetrics — caretta namespace (traffic-specific, may lack workload metrics) - {"caretta", "caretta-vm", 8428, ""}, -} - -// Namespaces commonly used for metrics services -var metricsNamespaces = map[string]bool{ - "monitoring": true, - "prometheus": true, - "observability": true, - "metrics": true, - "victoria-metrics": true, - "caretta": true, - "opencost": true, -} - -// Namespaces to skip during dynamic discovery -var skipNamespaces = map[string]bool{ - "kube-public": true, - "kube-node-lease": true, -} - // discover finds and connects to Prometheus using a multi-layer approach: // 1. Manual URL override (--prometheus-url) // 2. Existing traffic system port-forward -// 3. 
Well-known service locations -// 4. Dynamic cluster-wide discovery with scoring +// 3. Well-known service locations (via pkg/prom.Discover) +// 4. Dynamic cluster-wide discovery with scoring (via pkg/prom.Discover) +// +// Well-known + dynamic candidate enumeration lives in pkg/prom.Discover so +// it can be shared by any consumer of the package. This function owns +// Radar's port-forward fallback, which is only needed when Radar runs +// outside the cluster and can't reach in-cluster Service DNS directly. // // The lock is only held briefly to read/write state, not during network I/O. func (c *Client) discover(ctx context.Context) (string, string, error) { - // Layer 1: Manual URL override (read under lock) + // Layer 1: Manual URL override c.mu.RLock() manualURL := c.manualURL contextName := c.contextName @@ -91,7 +42,7 @@ func (c *Client) discover(ctx context.Context) (string, string, error) { return "", "", fmt.Errorf("manual Prometheus URL %s not reachable", addr) } - // Layer 2: Check if traffic system already has a port-forward + // Layer 2: Reuse traffic system's existing port-forward if present if pfAddr := portforward.GetAddress(contextName); pfAddr != "" { if c.probe(ctx, pfAddr) { log.Printf("[prometheus] Using traffic system port-forward: %s", pfAddr) @@ -104,324 +55,97 @@ func (c *Client) discover(ctx context.Context) (string, string, error) { return "", "", fmt.Errorf("no Kubernetes client available for discovery") } - // Layer 3: Well-known service locations — try each reachable candidate - candidates := c.findWellKnownServices(ctx) - if len(candidates) > 0 { - log.Printf("[prometheus] Found %d well-known service(s), probing...", len(candidates)) + // Layers 3 + 4: Enumerate candidates via the shared pkg/prom discovery + // logic. Well-known first, then dynamic fallbacks. 
+ candidates, err := prom.Discover(ctx, k8sClient, prom.DiscoverOptions{ + IncludeDynamic: true, + Logger: func(format string, args ...interface{}) { + log.Printf("[prometheus] "+format, args...) + }, + }) + if err != nil { + log.Printf("[prometheus] Discover error: %v", err) } - - for _, info := range candidates { - if c.probe(ctx, info.clusterAddr+info.basePath) { - log.Printf("[prometheus] Connected to %s/%s at %s", info.namespace, info.name, info.clusterAddr) - c.setDiscoveryService(info) - c.markConnected(info.clusterAddr, info.basePath) - return info.clusterAddr, info.basePath, nil - } - log.Printf("[prometheus] Well-known service %s/%s not reachable in-cluster, trying next...", info.namespace, info.name) + if len(candidates) == 0 { + errorlog.Record("prometheus", "warning", "no Prometheus service found in cluster") + return "", "", fmt.Errorf("no Prometheus service found in cluster") } - // If well-known services exist but none reachable in-cluster, try port-forward on first candidate - if len(candidates) > 0 { - info := candidates[0] - log.Printf("[prometheus] No well-known service reachable in-cluster, trying port-forward to %s/%s...", info.namespace, info.name) - c.setDiscoveryService(info) + log.Printf("[prometheus] Found %d candidate(s), probing...", len(candidates)) - connInfo, pfErr := portforward.Start(ctx, info.namespace, info.name, info.targetPort, contextName) - if pfErr == nil { - addr := connInfo.Address - if c.probe(ctx, addr+info.basePath) { - c.markConnected(addr, info.basePath) - return addr, info.basePath, nil - } - log.Printf("[prometheus] Well-known service %s/%s not responding after port-forward, falling back to dynamic discovery", info.namespace, info.name) - portforward.Stop() - } else { - errorlog.Record("prometheus", "error", "port-forward to %s/%s failed: %v", info.namespace, info.name, pfErr) + // First pass: probe each candidate at its in-cluster address. 
Works when + // radar is running in-cluster OR when the user's shell can route to the + // cluster DNS (rare, but cheap to try). + for _, cand := range candidates { + addr := cand.ClusterAddr + cand.BasePath + if c.probe(ctx, addr) { + log.Printf("[prometheus] Connected to %s/%s at %s (source=%s, score=%d)", + cand.Namespace, cand.Name, cand.ClusterAddr, cand.Source, cand.Score) + c.setDiscoveryServiceFromCandidate(cand) + c.markConnected(cand.ClusterAddr, cand.BasePath) + return cand.ClusterAddr, cand.BasePath, nil } } - // Layer 4: Dynamic discovery - info := c.discoverDynamic(ctx) - if info == nil { - c.mu.Lock() - c.discoveryService = nil - c.mu.Unlock() - errorlog.Record("prometheus", "warning", "no Prometheus service found in cluster") - return "", "", fmt.Errorf("no Prometheus service found in cluster") - } + // Fallback: try port-forwarding candidates in priority order. This path is + // normally reached when Radar runs outside the cluster, where in-cluster + // Service DNS cannot resolve from the user's machine. 
+ var lastErr error + for _, cand := range candidates { + log.Printf("[prometheus] No candidate reachable in-cluster, starting port-forward to %s/%s...", + cand.Namespace, cand.Name) + c.setDiscoveryServiceFromCandidate(cand) - c.setDiscoveryService(info) + connInfo, pfErr := portforward.Start(ctx, cand.Namespace, cand.Name, cand.TargetPort, contextName) + if pfErr != nil { + lastErr = fmt.Errorf("port-forward to %s/%s failed: %w", cand.Namespace, cand.Name, pfErr) + errorlog.Record("prometheus", "error", "port-forward to %s/%s failed: %v", cand.Namespace, cand.Name, pfErr) + continue + } - if c.probe(ctx, info.clusterAddr+info.basePath) { - log.Printf("[prometheus] Connected to %s/%s at %s (dynamic)", info.namespace, info.name, info.clusterAddr) - c.markConnected(info.clusterAddr, info.basePath) - return info.clusterAddr, info.basePath, nil - } + addr := connInfo.Address + if c.probe(ctx, addr+cand.BasePath) { + c.markConnected(addr, cand.BasePath) + return addr, cand.BasePath, nil + } - log.Printf("[prometheus] Service %s/%s not reachable in-cluster, starting port-forward...", info.namespace, info.name) - connInfo, err := portforward.Start(ctx, info.namespace, info.name, info.targetPort, contextName) - if err != nil { - errorlog.Record("prometheus", "error", "port-forward to %s/%s failed: %v", info.namespace, info.name, err) - return "", "", fmt.Errorf("port-forward to %s/%s failed: %w", info.namespace, info.name, err) + portforward.Stop() + lastErr = fmt.Errorf("Prometheus at %s/%s not responding after port-forward", cand.Namespace, cand.Name) + errorlog.Record("prometheus", "error", "Prometheus at %s/%s not responding after port-forward", cand.Namespace, cand.Name) } - addr := connInfo.Address - if c.probe(ctx, addr+info.basePath) { - c.markConnected(addr, info.basePath) - return addr, info.basePath, nil + c.mu.Lock() + c.discoveryService = nil + c.mu.Unlock() + if lastErr != nil { + return "", "", lastErr } - - portforward.Stop() - 
errorlog.Record("prometheus", "error", "Prometheus at %s/%s not responding after port-forward", info.namespace, info.name) - return "", "", fmt.Errorf("Prometheus at %s/%s not responding after port-forward", info.namespace, info.name) + return "", "", fmt.Errorf("no Prometheus service found in cluster") } -// setDiscoveryService records the discovered service metadata under write lock. -func (c *Client) setDiscoveryService(info *serviceInfo) { +// setDiscoveryServiceFromCandidate records the discovered service metadata +// from a pkg/prom.Candidate. +func (c *Client) setDiscoveryServiceFromCandidate(cand prom.Candidate) { c.mu.Lock() - c.discoveryService = &ServiceInfo{ - Namespace: info.namespace, - Name: info.name, - Port: info.port, - BasePath: info.basePath, + c.discoveryService = &prom.ServiceInfo{ + Namespace: cand.Namespace, + Name: cand.Name, + Port: cand.Port, + BasePath: cand.BasePath, } c.mu.Unlock() } -// markConnected records the active connection and marks discovery as complete. +// markConnected records the active connection and marks discovery as +// complete. Also clears any cached pkg/prom.Client so the next +// getPromClient rebuilds against the (possibly new) address — otherwise +// a stale cached client could survive a discovery that landed on a +// different endpoint. 
func (c *Client) markConnected(addr, basePath string) { c.mu.Lock() c.baseURL = addr c.basePath = basePath + c.prom = nil c.discovered = true c.mu.Unlock() } - -type serviceInfo struct { - namespace string - name string - port int // service port (for cluster-internal address) - targetPort int // container port (for port-forwarding to pod) - clusterAddr string - basePath string -} - -func (c *Client) findWellKnownServices(ctx context.Context) []*serviceInfo { - c.mu.RLock() - k8sClient := c.k8sClient - c.mu.RUnlock() - - var results []*serviceInfo - for _, loc := range wellKnownLocations { - svc, err := k8sClient.CoreV1().Services(loc.namespace).Get(ctx, loc.name, metav1.GetOptions{}) - if err != nil { - if !apierrors.IsNotFound(err) { - log.Printf("[prometheus] Error checking well-known service %s/%s: %v", loc.namespace, loc.name, err) - } - continue - } - - port := resolvePort(*svc, loc.port) - addr := buildClusterAddr(svc.Name, svc.Namespace, svc.Spec.ClusterIP, port) - tp := resolveTargetPort(*svc, port) - - log.Printf("[prometheus] Found well-known service: %s/%s:%d (targetPort=%d)", svc.Namespace, svc.Name, port, tp) - results = append(results, &serviceInfo{ - namespace: svc.Namespace, - name: svc.Name, - port: port, - targetPort: tp, - clusterAddr: addr, - basePath: loc.basePath, - }) - } - return results -} - -type scoredCandidate struct { - info serviceInfo - score int -} - -func (c *Client) discoverDynamic(ctx context.Context) *serviceInfo { - log.Printf("[prometheus] Starting dynamic discovery...") - - c.mu.RLock() - k8sClient := c.k8sClient - c.mu.RUnlock() - - svcs, err := k8sClient.CoreV1().Services("").List(ctx, metav1.ListOptions{}) - if err != nil { - log.Printf("[prometheus] Failed to list services: %v", err) - return nil - } - - var candidates []scoredCandidate - for _, svc := range svcs.Items { - score, bp := scoreService(svc) - if score <= 0 { - continue - } - port := resolvePort(svc, 0) - candidates = append(candidates, scoredCandidate{ - 
info: serviceInfo{ - namespace: svc.Namespace, - name: svc.Name, - port: port, - targetPort: resolveTargetPort(svc, port), - clusterAddr: buildClusterAddr(svc.Name, svc.Namespace, svc.Spec.ClusterIP, port), - basePath: bp, - }, - score: score, - }) - } - - if len(candidates) == 0 { - log.Printf("[prometheus] Dynamic discovery found no candidates") - return nil - } - - sort.Slice(candidates, func(i, j int) bool { - return candidates[i].score > candidates[j].score - }) - - limit := min(len(candidates), 5) - log.Printf("[prometheus] Found %d candidates, top %d:", len(candidates), limit) - for i := range limit { - log.Printf("[prometheus] %s/%s (score=%d)", candidates[i].info.namespace, candidates[i].info.name, candidates[i].score) - } - - // Validate top candidates (no lock held during probes) - for i := range limit { - cand := &candidates[i] - addr := cand.info.clusterAddr - - if c.probe(ctx, addr+cand.info.basePath) { - log.Printf("[prometheus] Validated: %s/%s", cand.info.namespace, cand.info.name) - return &cand.info - } - } - - // Return best unvalidated candidate (caller will port-forward) - best := &candidates[0] - log.Printf("[prometheus] No candidates reachable in-cluster, returning best: %s/%s (score=%d)", - best.info.namespace, best.info.name, best.score) - return &best.info -} - -// scoreService computes a heuristic score for a service being Prometheus-compatible. 
-func scoreService(svc corev1.Service) (score int, basePath string) { - labels := svc.Labels - name := svc.Name - ns := svc.Namespace - - if svc.Spec.Type == corev1.ServiceTypeExternalName { - return 0, "" - } - if skipNamespaces[ns] { - return 0, "" - } - - // Label signals - appName := labels["app.kubernetes.io/name"] - appLabel := labels["app"] - component := labels["app.kubernetes.io/component"] - - switch appName { - case "prometheus": - score += 100 - case "victoria-metrics-single", "vmsingle": - score += 100 - case "vmselect": - score += 90 - basePath = "/select/0/prometheus" - case "thanos-query", "thanos-querier": - score += 80 - } - - switch appLabel { - case "prometheus", "prometheus-server": - score += 80 - case "vmsingle": - score += 80 - case "vmselect": - score += 80 - basePath = "/select/0/prometheus" - } - - if score > 0 && component == "server" { - score += 20 - } - - // Port signals - for _, p := range svc.Spec.Ports { - switch p.Port { - case 9090: // Prometheus default - score += 30 - case 8428: // VictoriaMetrics single-node default - score += 30 - case 8481: // VictoriaMetrics vmselect default - score += 25 - case 9009: // Thanos Query default - score += 25 - } - if strings.Contains(strings.ToLower(p.Name), "prometheus") { - score += 10 - } - } - - // Name signals - nameLower := strings.ToLower(name) - if strings.Contains(nameLower, "prometheus") { - score += 20 - } - if strings.Contains(nameLower, "victoria") || strings.Contains(nameLower, "vmsingle") || strings.Contains(nameLower, "vmselect") { - score += 20 - if strings.Contains(nameLower, "vmselect") && basePath == "" { - basePath = "/select/0/prometheus" - } - } - if strings.Contains(nameLower, "thanos") { - score += 15 - } - - // Namespace signal - if metricsNamespaces[ns] { - score += 10 - } - - return score, basePath -} - -func resolvePort(svc corev1.Service, defaultPort int) int { - if defaultPort != 0 { - return defaultPort - } - if len(svc.Spec.Ports) > 0 { - return 
int(svc.Spec.Ports[0].Port) - } - return 80 -} - -// resolveTargetPort returns the container port for port-forwarding. -// When the service port differs from the container's targetPort (e.g., service:80 → container:9090), -// port-forwarding needs the container port since it bypasses the Service and connects directly to the pod. -func resolveTargetPort(svc corev1.Service, servicePort int) int { - for _, p := range svc.Spec.Ports { - if int(p.Port) == servicePort { - if p.TargetPort.IntVal > 0 { - return int(p.TargetPort.IntVal) - } - // targetPort unset or zero defaults to the service port - return servicePort - } - } - return servicePort -} - -func buildClusterAddr(name, namespace, clusterIP string, port int) string { - if clusterIP == "None" { - return fmt.Sprintf("http://%s-0.%s.%s.svc.cluster.local:%d", name, name, namespace, port) - } - return fmt.Sprintf("http://%s.%s.svc.cluster.local:%d", name, namespace, port) -} diff --git a/internal/prometheus/handlers.go b/internal/prometheus/handlers.go index 8b223b406..a7239423a 100644 --- a/internal/prometheus/handlers.go +++ b/internal/prometheus/handlers.go @@ -6,13 +6,13 @@ import ( "fmt" "log" "net/http" - "net/url" "strings" "time" "github.com/go-chi/chi/v5" "github.com/skyhook-io/radar/internal/errorlog" "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/prom" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" ) @@ -46,14 +46,17 @@ func writeError(w http.ResponseWriter, status int, msg string) { func handleStatus(w http.ResponseWriter, r *http.Request) { client := GetClient() if client == nil { - writeJSON(w, http.StatusOK, Status{Available: false, Error: "Prometheus client not initialized"}) + writeJSON(w, http.StatusOK, prom.Status{Available: false, Error: "Prometheus client not initialized"}) return } writeJSON(w, http.StatusOK, client.GetStatus()) } -// handleConnect triggers Prometheus discovery and connection. 
-// Accepts optional "url" query param to override discovery with a specific endpoint. +// handleConnect triggers Prometheus discovery and connection. The endpoint +// has no body or query parameters — the Prometheus URL is configured at +// process startup via --prometheus-url, never per-request. Accepting a URL +// here would let any caller redirect Prometheus queries to an arbitrary +// host (SSRF) since radar binds to 0.0.0.0 by default. func handleConnect(w http.ResponseWriter, r *http.Request) { client := GetClient() if client == nil { @@ -61,16 +64,6 @@ func handleConnect(w http.ResponseWriter, r *http.Request) { return } - // Allow URL override via query param (resets existing connection) - if overrideURL := r.URL.Query().Get("url"); overrideURL != "" { - u, err := url.Parse(overrideURL) - if err != nil || (u.Scheme != "http" && u.Scheme != "https") { - writeError(w, http.StatusBadRequest, "invalid URL: must be a valid HTTP(S) URL") - return - } - client.SetURL(overrideURL) - } - _, _, err := client.EnsureConnected(r.Context()) if err != nil { log.Printf("[prometheus] Connection failed: %v", err) @@ -136,10 +129,10 @@ type ResourceMetricsResponse struct { Kind string `json:"kind"` Namespace string `json:"namespace,omitempty"` Name string `json:"name"` - Category MetricCategory `json:"category"` + Category prom.MetricCategory `json:"category"` Unit string `json:"unit"` Range string `json:"range"` - Result *QueryResult `json:"result"` + Result *prom.QueryResult `json:"result"` Query string `json:"query,omitempty"` // PromQL query used (included when result is empty for diagnostics) Hint string `json:"hint,omitempty"` // Contextual hint when results are empty (e.g. 
cri-docker label issues) } @@ -157,14 +150,14 @@ func handleResourceMetrics(w http.ResponseWriter, r *http.Request) { namespace := chi.URLParam(r, "namespace") name := chi.URLParam(r, "name") - category := MetricCategory(r.URL.Query().Get("category")) + category := prom.MetricCategory(r.URL.Query().Get("category")) if category == "" { - category = CategoryCPU + category = prom.CategoryCPU } // Validate kind is supported supported := false - for _, k := range SupportedKinds() { + for _, k := range prom.SupportedKinds() { if strings.EqualFold(k, kind) { kind = k // normalize casing supported = true @@ -177,7 +170,7 @@ func handleResourceMetrics(w http.ResponseWriter, r *http.Request) { } // Validate category - validCategories := CategoriesForKind(kind) + validCategories := prom.CategoriesForKind(kind) categoryValid := false for _, c := range validCategories { if c == category { @@ -190,7 +183,7 @@ func handleResourceMetrics(w http.ResponseWriter, r *http.Request) { return } - query := BuildQuery(kind, namespace, name, category) + query := prom.BuildQuery(kind, namespace, name, category) if query == "" { writeError(w, http.StatusBadRequest, "cannot build query for "+kind+"/"+string(category)) return @@ -201,22 +194,22 @@ func handleResourceMetrics(w http.ResponseWriter, r *http.Request) { result, err := client.QueryRange(r.Context(), query, start, end, step) if err != nil { - log.Printf("[prometheus] Query failed for %s/%s/%s (%s): %v", kind, namespace, name, category, err) - errorlog.Record("prometheus", "error", "query failed for %s/%s/%s (%s): %v", kind, namespace, name, category, err) + log.Printf("[prometheus] Query failed for %q/%q/%q (%q): %v", kind, namespace, name, category, err) + errorlog.Record("prometheus", "error", "query failed for %q/%q/%q (%q): %v", kind, namespace, name, category, err) writeError(w, http.StatusBadGateway, "Prometheus query failed: "+err.Error()) return } result, query = retryWithoutContainerFilter(r.Context(), client, result, query, 
category, start, end, step, - func() string { return BuildQueryNoContainerFilter(kind, namespace, name, category) }, - fmt.Sprintf("Primary query empty for %s/%s/%s (%s)", kind, namespace, name, category)) + func() string { return prom.BuildQueryNoContainerFilter(kind, namespace, name, category) }, + fmt.Sprintf("Primary query empty for %q/%q/%q (%q)", kind, namespace, name, category)) resp := ResourceMetricsResponse{ Kind: kind, Namespace: namespace, Name: name, Category: category, - Unit: CategoryUnitForKind(kind, category), + Unit: prom.CategoryUnitForKind(kind, category), Range: rangeStr, Result: result, } @@ -225,8 +218,8 @@ func handleResourceMetrics(w http.ResponseWriter, r *http.Request) { if len(result.Series) == 0 { resp.Query = query resp.Hint = detectCRIDockerHint(kind, namespace, name) - log.Printf("[prometheus] Empty result for %s/%s/%s (%s), query: %s", kind, namespace, name, category, query) - errorlog.Record("prometheus", "warning", "empty result for %s/%s/%s (%s), query: %s", kind, namespace, name, category, query) + log.Printf("[prometheus] Empty result for %q/%q/%q (%q), query: %q", kind, namespace, name, category, query) + errorlog.Record("prometheus", "warning", "empty result for %q/%q/%q (%q), query: %q", kind, namespace, name, category, query) } writeJSON(w, http.StatusOK, resp) } @@ -249,12 +242,12 @@ func handleClusterScopedResourceMetrics(w http.ResponseWriter, r *http.Request) } kind = "Node" - category := MetricCategory(r.URL.Query().Get("category")) + category := prom.MetricCategory(r.URL.Query().Get("category")) if category == "" { - category = CategoryCPU + category = prom.CategoryCPU } - validCategories := CategoriesForKind(kind) + validCategories := prom.CategoriesForKind(kind) categoryValid := false for _, c := range validCategories { if c == category { @@ -267,7 +260,7 @@ func handleClusterScopedResourceMetrics(w http.ResponseWriter, r *http.Request) return } - query := BuildQuery(kind, "", name, category) + query := 
prom.BuildQuery(kind, "", name, category) if query == "" { writeError(w, http.StatusBadRequest, "cannot build query for "+kind+"/"+string(category)) return @@ -278,8 +271,8 @@ func handleClusterScopedResourceMetrics(w http.ResponseWriter, r *http.Request) result, err := client.QueryRange(r.Context(), query, start, end, step) if err != nil { - log.Printf("[prometheus] Query failed for %s/%s (%s): %v", kind, name, category, err) - errorlog.Record("prometheus", "error", "query failed for %s/%s (%s): %v", kind, name, category, err) + log.Printf("[prometheus] Query failed for %q/%q (%q): %v", kind, name, category, err) + errorlog.Record("prometheus", "error", "query failed for %q/%q (%q): %v", kind, name, category, err) writeError(w, http.StatusBadGateway, "Prometheus query failed: "+err.Error()) return } @@ -288,14 +281,14 @@ func handleClusterScopedResourceMetrics(w http.ResponseWriter, r *http.Request) Kind: kind, Name: name, Category: category, - Unit: CategoryUnitForKind(kind, category), + Unit: prom.CategoryUnitForKind(kind, category), Range: rangeStr, Result: result, } if len(result.Series) == 0 { resp.Query = query - log.Printf("[prometheus] Empty result for %s/%s (%s), query: %s", kind, name, category, query) - errorlog.Record("prometheus", "warning", "empty result for %s/%s (%s), query: %s", kind, name, category, query) + log.Printf("[prometheus] Empty result for %q/%q (%q), query: %q", kind, name, category, query) + errorlog.Record("prometheus", "warning", "empty result for %q/%q (%q), query: %q", kind, name, category, query) } writeJSON(w, http.StatusOK, resp) } @@ -303,10 +296,10 @@ func handleClusterScopedResourceMetrics(w http.ResponseWriter, r *http.Request) // NamespaceMetricsResponse is the response shape for namespace-level metrics. 
type NamespaceMetricsResponse struct { Namespace string `json:"namespace"` - Category MetricCategory `json:"category"` + Category prom.MetricCategory `json:"category"` Unit string `json:"unit"` Range string `json:"range"` - Result *QueryResult `json:"result"` + Result *prom.QueryResult `json:"result"` } // handleNamespaceMetrics returns aggregate metrics for a namespace. @@ -318,12 +311,12 @@ func handleNamespaceMetrics(w http.ResponseWriter, r *http.Request) { } namespace := chi.URLParam(r, "namespace") - category := MetricCategory(r.URL.Query().Get("category")) + category := prom.MetricCategory(r.URL.Query().Get("category")) if category == "" { - category = CategoryCPU + category = prom.CategoryCPU } - query := BuildNamespaceQuery(namespace, category) + query := prom.BuildNamespaceQuery(namespace, category) if query == "" { writeError(w, http.StatusBadRequest, "unsupported category for namespace: "+string(category)) return @@ -334,20 +327,20 @@ func handleNamespaceMetrics(w http.ResponseWriter, r *http.Request) { result, err := client.QueryRange(r.Context(), query, start, end, step) if err != nil { - log.Printf("[prometheus] Namespace query failed for %s (%s): %v", namespace, category, err) - errorlog.Record("prometheus", "error", "namespace query failed for %s (%s): %v", namespace, category, err) + log.Printf("[prometheus] Namespace query failed for %q (%q): %v", namespace, category, err) + errorlog.Record("prometheus", "error", "namespace query failed for %q (%q): %v", namespace, category, err) writeError(w, http.StatusBadGateway, "Prometheus query failed: "+err.Error()) return } result, _ = retryWithoutContainerFilter(r.Context(), client, result, query, category, start, end, step, - func() string { return BuildNamespaceQueryNoContainerFilter(namespace, category) }, - fmt.Sprintf("Namespace query empty for %s (%s)", namespace, category)) + func() string { return prom.BuildNamespaceQueryNoContainerFilter(namespace, category) }, + fmt.Sprintf("Namespace query 
empty for %q (%q)", namespace, category)) writeJSON(w, http.StatusOK, NamespaceMetricsResponse{ Namespace: namespace, Category: category, - Unit: CategoryUnit(category), + Unit: prom.CategoryUnit(category), Range: rangeStr, Result: result, }) @@ -355,10 +348,10 @@ func handleNamespaceMetrics(w http.ResponseWriter, r *http.Request) { // ClusterMetricsResponse is the response shape for cluster-level metrics. type ClusterMetricsResponse struct { - Category MetricCategory `json:"category"` + Category prom.MetricCategory `json:"category"` Unit string `json:"unit"` Range string `json:"range"` - Result *QueryResult `json:"result"` + Result *prom.QueryResult `json:"result"` } // handleClusterMetrics returns aggregate metrics for the entire cluster. @@ -369,12 +362,12 @@ func handleClusterMetrics(w http.ResponseWriter, r *http.Request) { return } - category := MetricCategory(r.URL.Query().Get("category")) + category := prom.MetricCategory(r.URL.Query().Get("category")) if category == "" { - category = CategoryCPU + category = prom.CategoryCPU } - query := BuildClusterQuery(category) + query := prom.BuildClusterQuery(category) if query == "" { writeError(w, http.StatusBadRequest, "unsupported category for cluster: "+string(category)) return @@ -385,19 +378,19 @@ func handleClusterMetrics(w http.ResponseWriter, r *http.Request) { result, err := client.QueryRange(r.Context(), query, start, end, step) if err != nil { - log.Printf("[prometheus] Cluster query failed (%s): %v", category, err) - errorlog.Record("prometheus", "error", "cluster query failed (%s): %v", category, err) + log.Printf("[prometheus] Cluster query failed (%q): %v", category, err) + errorlog.Record("prometheus", "error", "cluster query failed (%q): %v", category, err) writeError(w, http.StatusBadGateway, "Prometheus query failed: "+err.Error()) return } result, _ = retryWithoutContainerFilter(r.Context(), client, result, query, category, start, end, step, - func() string { return 
BuildClusterQueryNoContainerFilter(category) }, - fmt.Sprintf("Cluster query empty (%s)", category)) + func() string { return prom.BuildClusterQueryNoContainerFilter(category) }, + fmt.Sprintf("Cluster query empty (%q)", category)) writeJSON(w, http.StatusOK, ClusterMetricsResponse{ Category: category, - Unit: CategoryUnit(category), + Unit: prom.CategoryUnit(category), Range: rangeStr, Result: result, }) @@ -449,8 +442,8 @@ func handleRawQuery(w http.ResponseWriter, r *http.Request) { // when the primary result is empty and the category uses that filter. This handles // cri-docker and other setups where cAdvisor metrics lack the container label. // Returns the updated result (original or fallback) and the query that produced it. -func retryWithoutContainerFilter(ctx context.Context, client *Client, result *QueryResult, query string, category MetricCategory, start, end time.Time, step time.Duration, buildFallback func() string, logPrefix string) (*QueryResult, string) { - if len(result.Series) > 0 || !categoryUsesContainerFilter(category) { +func retryWithoutContainerFilter(ctx context.Context, client *Client, result *prom.QueryResult, query string, category prom.MetricCategory, start, end time.Time, step time.Duration, buildFallback func() string, logPrefix string) (*prom.QueryResult, string) { + if len(result.Series) > 0 || !prom.CategoryUsesContainerFilter(category) { return result, query } fallbackQuery := buildFallback() diff --git a/internal/prometheus/queries_test.go b/internal/prometheus/queries_test.go index ca7fd1480..bfc67fd3f 100644 --- a/internal/prometheus/queries_test.go +++ b/internal/prometheus/queries_test.go @@ -3,6 +3,8 @@ package prometheus import ( "strings" "testing" + + "github.com/skyhook-io/radar/pkg/prom" ) func TestMemoryQueriesDedupeScrapeJobsBeforeSumming(t *testing.T) { @@ -13,22 +15,22 @@ func TestMemoryQueriesDedupeScrapeJobsBeforeSumming(t *testing.T) { }{ { name: "pod", - query: BuildQuery("Pod", "dify-new", 
"dify-new-postgresql-primary-0", CategoryMemory), + query: prom.BuildQuery("Pod", "dify-new", "dify-new-postgresql-primary-0", prom.CategoryMemory), want: "sum by (pod,namespace) (max by (pod,namespace,container)", }, { name: "workload", - query: BuildQuery("StatefulSet", "dify-new", "dify-new-postgresql-primary", CategoryMemory), + query: prom.BuildQuery("StatefulSet", "dify-new", "dify-new-postgresql-primary", prom.CategoryMemory), want: "sum by (pod,namespace) (max by (pod,namespace,container)", }, { name: "namespace", - query: BuildNamespaceQuery("dify-new", CategoryMemory), + query: prom.BuildNamespaceQuery("dify-new", prom.CategoryMemory), want: "sum(max by (namespace,pod,container)", }, { name: "cluster", - query: BuildClusterQuery(CategoryMemory), + query: prom.BuildClusterQuery(prom.CategoryMemory), want: "sum(max by (namespace,pod,container)", }, } diff --git a/internal/prometheus/rightsizing.go b/internal/prometheus/rightsizing.go index 0390210c2..ec52d51db 100644 --- a/internal/prometheus/rightsizing.go +++ b/internal/prometheus/rightsizing.go @@ -11,6 +11,7 @@ import ( "github.com/go-chi/chi/v5" "github.com/skyhook-io/radar/internal/errorlog" "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/prom" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -305,9 +306,9 @@ func computeRightsizingRow(ctx context.Context, client *Client, namespace, workl // queryContainerP95 returns the P95 of a container's CPU/memory usage over the // rightsizing window. Returns nil (no error) when there's no data. 
func queryContainerP95(ctx context.Context, client *Client, namespace, workload, container, resKind string) (*float64, error) { - ns := SanitizeLabelValue(namespace) - podPattern := fmt.Sprintf("%s-.*", escapeRegexMeta(SanitizeLabelValue(workload))) - cn := SanitizeLabelValue(container) + ns := prom.SanitizeLabelValue(namespace) + podPattern := fmt.Sprintf("%s-.*", prom.EscapeRegexMeta(prom.SanitizeLabelValue(workload))) + cn := prom.SanitizeLabelValue(container) windowSec := int64(rightsizingWindow.Seconds()) var query string @@ -520,8 +521,8 @@ func handlePVCUsage(w http.ResponseWriter, r *http.Request) { return } - ns := SanitizeLabelValue(namespace) - pvc := SanitizeLabelValue(name) + ns := prom.SanitizeLabelValue(namespace) + pvc := prom.SanitizeLabelValue(name) // kubelet's native label is `persistentvolumeclaim`; clusters with custom // relabeling that renamed it will return no series and the gauge hides. @@ -560,7 +561,7 @@ func handlePVCUsage(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, resp) } -func firstValue(res *QueryResult) *float64 { +func firstValue(res *prom.QueryResult) *float64 { if res == nil || len(res.Series) == 0 || len(res.Series[0].DataPoints) == 0 { return nil } diff --git a/internal/traffic/caretta.go b/internal/traffic/caretta.go index 0505b684f..e1035049b 100644 --- a/internal/traffic/caretta.go +++ b/internal/traffic/caretta.go @@ -19,7 +19,7 @@ import ( "github.com/skyhook-io/radar/internal/errorlog" "github.com/skyhook-io/radar/internal/portforward" - promclient "github.com/skyhook-io/radar/internal/prometheus" + "github.com/skyhook-io/radar/pkg/prom" ) const ( @@ -336,7 +336,7 @@ func (c *CarettaSource) queryPrometheusForFlows(ctx context.Context, promAddr st query := "caretta_links_observed" if opts.Namespace != "" { // Filter by namespace (either client or server) - safeNS := promclient.SanitizeLabelValue(opts.Namespace) + safeNS := prom.SanitizeLabelValue(opts.Namespace) query = 
fmt.Sprintf(`caretta_links_observed{client_namespace="%s"} or caretta_links_observed{server_namespace="%s"}`, safeNS, safeNS) } diff --git a/internal/traffic/istio.go b/internal/traffic/istio.go index 9a74ea7d2..4cff077d0 100644 --- a/internal/traffic/istio.go +++ b/internal/traffic/istio.go @@ -12,6 +12,7 @@ import ( "github.com/skyhook-io/radar/internal/portforward" promclient "github.com/skyhook-io/radar/internal/prometheus" + "github.com/skyhook-io/radar/pkg/prom" ) const ( @@ -153,7 +154,7 @@ func (s *IstioSource) queryHTTPFlows(ctx context.Context, client *promclient.Cli // Main query: all requests, no response_code grouping query := `sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, request_protocol, reporter) (rate(istio_requests_total{reporter="destination"}[5m]))` if opts.Namespace != "" { - safeNS := promclient.SanitizeLabelValue(opts.Namespace) + safeNS := prom.SanitizeLabelValue(opts.Namespace) query = fmt.Sprintf(`sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, request_protocol, reporter) (rate(istio_requests_total{reporter="destination", source_workload_namespace="%s"}[5m])) or sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, request_protocol, reporter) (rate(istio_requests_total{reporter="destination", destination_workload_namespace="%s"}[5m]))`, safeNS, safeNS) } @@ -161,7 +162,7 @@ func (s *IstioSource) queryHTTPFlows(ctx context.Context, client *promclient.Cli // Error query: 5xx only errorQuery := `sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, reporter) (rate(istio_requests_total{reporter="destination", response_code=~"5.."}[5m]))` if opts.Namespace != "" { - safeNS := promclient.SanitizeLabelValue(opts.Namespace) + safeNS := 
prom.SanitizeLabelValue(opts.Namespace) errorQuery = fmt.Sprintf(`sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, reporter) (rate(istio_requests_total{reporter="destination", response_code=~"5..", source_workload_namespace="%s"}[5m])) or sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, reporter) (rate(istio_requests_total{reporter="destination", response_code=~"5..", destination_workload_namespace="%s"}[5m]))`, safeNS, safeNS) } @@ -294,14 +295,14 @@ func (s *IstioSource) queryByteMetrics(ctx context.Context, client *promclient.C sentQuery := `sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_request_bytes_sum{reporter="destination"}[5m]))` recvQuery := `sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_response_bytes_sum{reporter="destination"}[5m]))` if opts.Namespace != "" { - safeNS := promclient.SanitizeLabelValue(opts.Namespace) + safeNS := prom.SanitizeLabelValue(opts.Namespace) sentQuery = fmt.Sprintf(`sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_request_bytes_sum{reporter="destination", source_workload_namespace="%s"}[5m])) or sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_request_bytes_sum{reporter="destination", destination_workload_namespace="%s"}[5m]))`, safeNS, safeNS) recvQuery = fmt.Sprintf(`sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_response_bytes_sum{reporter="destination", source_workload_namespace="%s"}[5m])) or sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace) (rate(istio_response_bytes_sum{reporter="destination", 
destination_workload_namespace="%s"}[5m]))`, safeNS, safeNS) } - parseByteResult := func(result *promclient.QueryResult, target map[flowKey]float64) { + parseByteResult := func(result *prom.QueryResult, target map[flowKey]float64) { if result == nil { return } @@ -345,7 +346,7 @@ func (s *IstioSource) queryByteMetrics(ctx context.Context, client *promclient.C func (s *IstioSource) queryTCPFlows(ctx context.Context, client *promclient.Client, opts FlowOptions) ([]Flow, error) { query := `sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, reporter) (rate(istio_tcp_connections_opened_total{reporter="destination"}[5m]))` if opts.Namespace != "" { - safeNS := promclient.SanitizeLabelValue(opts.Namespace) + safeNS := prom.SanitizeLabelValue(opts.Namespace) query = fmt.Sprintf(`sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, reporter) (rate(istio_tcp_connections_opened_total{reporter="destination", source_workload_namespace="%s"}[5m])) or sum by (source_workload, source_workload_namespace, destination_workload, destination_workload_namespace, destination_service_name, reporter) (rate(istio_tcp_connections_opened_total{reporter="destination", destination_workload_namespace="%s"}[5m]))`, safeNS, safeNS) } diff --git a/pkg/opencost/compute.go b/pkg/opencost/compute.go new file mode 100644 index 000000000..e134c4139 --- /dev/null +++ b/pkg/opencost/compute.go @@ -0,0 +1,509 @@ +package opencost + +import ( + "context" + "log" + "math" + "sort" + "strconv" + "strings" + + "github.com/skyhook-io/radar/pkg/prom" +) + +// windowHours parses an OpenCost window string (e.g. "1h", "24h", "7d", +// "30d") into a number of hours. OpenCost's /allocation returns totalCost +// summed over the whole window; to present an hourly rate (which then +// multiplies by 730 for monthly projection) we divide by this value. 
// Falls back to 1.0 for unknown inputs so callers degrade gracefully rather
// than silently zero out costs.
//
// Accepted units (case-insensitive, single trailing character):
//   - "h" hours
//   - "d" days (24h)
//   - "w" weeks (7d)
//   - "m" minutes (see note in the switch)
//
// The fallback also covers empty strings, a bare unit with no number,
// unparsable numbers, non-positive values, and values that are not finite
// (NaN/Inf). strconv.ParseFloat happily parses "nan" and "inf" without an
// error, and a huge-but-valid number can overflow to +Inf once multiplied by
// the unit factor; dividing a cost by any of those would zero or poison the
// result, which is exactly what the fallback exists to prevent.
func windowHours(w string) float64 {
	s := strings.TrimSpace(strings.ToLower(w))
	if len(s) < 2 {
		// Need at least one digit plus the unit suffix.
		return 1
	}

	unit := s[len(s)-1]
	n, err := strconv.ParseFloat(s[:len(s)-1], 64)
	// NaN compares false against everything, so it must be rejected
	// explicitly; `n <= 0` alone lets it through.
	if err != nil || math.IsNaN(n) || n <= 0 {
		return 1
	}

	var hours float64
	switch unit {
	case 'h':
		hours = n
	case 'd':
		hours = n * 24
	case 'w':
		hours = n * 24 * 7
	case 'm':
		// Ambiguous (minutes vs months). OpenCost uses "30d" for months, so
		// treat lone "m" as minutes for safety.
		hours = n / 60
	default:
		return 1
	}

	// Guard the final value too: the unit factor can overflow to +Inf or
	// underflow to 0, and both are unusable as a divisor.
	if math.IsInf(hours, 0) || hours <= 0 {
		return 1
	}
	return hours
}

// SummaryOptions tunes ComputeCostSummary behavior.
type SummaryOptions struct {
	// Currency label returned in the response (default "USD").
	Currency string

	// Window passed to OpenCost and echoed in the response (default "1h").
	// For PromQL paths this is a response label only; the query itself has
	// fixed time windows baked in. For REST paths it's forwarded to OpenCost.
	Window string

	// Aggregate controls how rows are grouped. "namespace" (default),
	// "controller", "pod". Passed straight to OpenCost's aggregate param.
	Aggregate string

	// Filter is an OpenCost /allocation filter expression (v1.106+).
	// Commonly used to scope pod/controller queries to a single namespace.
	// Example: `namespace:"kube-system"`
	Filter string

	// NamespaceFilter is a client-side namespace scope applied after the
	// OpenCost response is received. Set alongside Filter — older OpenCost
	// versions silently ignore the REST `filter` param, so we have to post-
	// filter rows by their Properties["namespace"] to actually honor the
	// drill-down scope.
	NamespaceFilter string
}

// ComputeCostSummary is the default compute path: asks OpenCost's REST API
// for namespace-level allocation over the window and maps the response into
// our normalized CostSummary.
+// +// Why REST by default: OpenCost computes cost internally (cloud pricing + +// Kubernetes allocation data) and exposes the results two ways — REST at +// /allocation/assets/cloudCost and Prometheus metrics at /metrics. REST +// works wherever OpenCost works; the Prometheus path requires a scrape +// config that's often missing on clusters where OpenCost was installed +// manually. REST is also simpler (one pre-aggregated call instead of ~6 +// PromQL queries + client-side math). +// +// When to reach for ComputeCostSummaryFromProm instead: +// - You need custom label aggregations beyond what /allocation exposes. +// - You want per-node hourly pricing as time series. +// - You're correlating cost with live Prometheus metrics (deploy events, +// HPA state, container_cpu_usage, etc.) in the same query. +// +// Contract: +// - REST unreachable or returns error → Available=false, Reason=ReasonQueryError. +// - REST returns empty data (OpenCost up but has no cost rows yet) → +// Available=false, Reason=ReasonNoMetrics. +// - Otherwise Available=true with namespace rows + totals filled in. +// - Numbers rounded to 4dp for JSON cleanliness. +func ComputeCostSummary(ctx context.Context, client *RESTClient, opts SummaryOptions) *CostSummary { + if opts.Currency == "" { + opts.Currency = "USD" + } + if opts.Window == "" { + opts.Window = "1h" + } + + aggregate := opts.Aggregate + if aggregate == "" { + aggregate = "namespace" + } + resp, err := client.GetAllocation(ctx, AllocationOptions{ + Window: opts.Window, + Aggregate: aggregate, + Filter: opts.Filter, + IncludeIdle: true, + }) + if err != nil { + log.Printf("[opencost] /allocation summary failed: %v", err) + return &CostSummary{Available: false, Reason: ReasonQueryError} + } + if resp == nil || len(resp.Data) == 0 { + return &CostSummary{Available: false, Reason: ReasonNoMetrics} + } + + // /allocation returns an array of time windows. For a single bucket we + // merge across all windows; normally there's just one. 
+ // + // Older OpenCost versions (< v1.106) silently ignore the REST filter param, + // so when NamespaceFilter is set we post-filter rows by their + // Properties["namespace"]. The __idle__ synthetic row has no namespace, so + // it naturally drops out of a scoped drill-down — desired. + combined := make(map[string]*Allocation) + for _, bucket := range resp.Data { + for name, a := range bucket { + if a == nil { + continue + } + if opts.NamespaceFilter != "" { + ns, _ := a.Properties["namespace"].(string) + if ns != opts.NamespaceFilter { + continue + } + } + if existing, ok := combined[name]; ok { + existing.CPUCost += a.CPUCost + existing.RAMCost += a.RAMCost + existing.PVCost += a.PVCost + existing.NetworkCost += a.NetworkCost + existing.LoadBalancerCost += a.LoadBalancerCost + existing.SharedCost += a.SharedCost + existing.ExternalCost += a.ExternalCost + existing.TotalCost += a.TotalCost + existing.CPUCoreUsageAverage += a.CPUCoreUsageAverage + existing.RAMByteUsageAverage += a.RAMByteUsageAverage + } else { + cp := *a + combined[name] = &cp + } + } + } + + if len(combined) == 0 { + return &CostSummary{Available: false, Reason: ReasonNoMetrics} + } + + namespaces := make([]NamespaceCost, 0, len(combined)) + var totalHourlyCost, totalStorageCost, totalNetworkCost, totalIdleCost float64 + var totalAllocCost, totalUsageCost float64 + + for name, a := range combined { + // OpenCost emits __idle__ as a synthetic row for unallocated node + // capacity. Surface it as a dedicated idle total, not a namespace. + // + // Sign quirk: OpenCost can report __idle__ with negative costs when + // the cluster's allocated sum over-counts relative to node pricing + // (burstable workloads exceeding their request, or pricing-model + // rounding). Clamp negative idle to 0 — idle is conceptually + // "unused capacity cost", always non-negative. 
+ if name == "__idle__" { + idle := a.CPUCost + a.RAMCost + if idle < 0 { + idle = 0 + } + totalIdleCost += idle + // Intentionally do NOT add __idle__ to totalHourlyCost — + // totalHourlyCost is the sum of allocated spend. Idle is + // surfaced separately as TotalIdleCost so callers can render + // or sum it as needed. + continue + } + // OpenCost aggregates orphan pods (those with no controller) into a + // synthetic "__unallocated__" row when grouping by controller. On some + // cluster configurations this row also absorbs cluster-level idle, + // making it appear larger than the parent namespace. Drop it to keep + // the drill-down consistent — named controllers tell the real story. + if name == "__unallocated__" { + continue + } + nc := NamespaceCost{ + Name: name, + Kind: aggregate, + CPUCost: a.CPUCost, + MemoryCost: a.RAMCost, + StorageCost: a.PVCost, + NetworkCost: a.NetworkCost, + HourlyCost: a.TotalCost, + } + // For non-namespace aggregates, OpenCost stamps the parent namespace + // in Properties so the UI can thread children under their parent + // without a second query. + if aggregate != "namespace" { + if ns, ok := a.Properties["namespace"].(string); ok { + nc.Namespace = ns + } + } + allocCost := nc.CPUCost + nc.MemoryCost + if a.TotalEfficiency > 0 && allocCost > 0 { + // Cap per-row efficiency at 1.0 BEFORE accumulating into the + // cluster total. OpenCost occasionally reports TotalEfficiency + // > 1 (burstable pods exceeding their request, measurement + // noise); without this cap a single outlier could push the + // cluster total above 100%. + rowEff := a.TotalEfficiency + if rowEff > 1 { + rowEff = 1 + } + usageCost := rowEff * allocCost + nc.CPUUsageCost = usageCost * safeRatio(nc.CPUCost, allocCost) + nc.MemoryUsageCost = usageCost - nc.CPUUsageCost + nc.Efficiency = efficiencyPct(usageCost, allocCost) + nc.IdleCost = idleFromUsage(usageCost, allocCost) + // Accumulate cost-weighted, matching ComputeCostSummaryFromProm. 
+ // An unweighted mean would let a $0.01 row at 10% efficiency + // drag down the cluster number identically to a $100 row. + totalAllocCost += allocCost + totalUsageCost += usageCost + } + totalHourlyCost += nc.HourlyCost + totalStorageCost += nc.StorageCost + totalNetworkCost += nc.NetworkCost + // Per-namespace idle (allocated-not-used) is separate from the + // __idle__ row (unassigned node capacity). Both are real waste the + // user can act on, so aggregate them together. + totalIdleCost += nc.IdleCost + namespaces = append(namespaces, nc) + } + + sort.Slice(namespaces, func(i, j int) bool { + return namespaces[i].HourlyCost > namespaces[j].HourlyCost + }) + + clusterEfficiency := efficiencyPct(totalUsageCost, totalAllocCost) + + // Normalize window-total to hourly. OpenCost's /allocation returns + // totalCost summed over the entire window; we want rate so the UI can + // multiply by 730 for monthly projections regardless of the window + // picker state. Efficiency is unitless (usage/alloc ratio) so it does + // not need normalization. + hours := windowHours(opts.Window) + if hours <= 0 { + hours = 1 + } + normalize := func(v float64) float64 { return v / hours } + totalHourlyCost = normalize(totalHourlyCost) + totalStorageCost = normalize(totalStorageCost) + totalNetworkCost = normalize(totalNetworkCost) + totalIdleCost = normalize(totalIdleCost) + for i := range namespaces { + namespaces[i].HourlyCost = normalize(namespaces[i].HourlyCost) + namespaces[i].CPUCost = normalize(namespaces[i].CPUCost) + namespaces[i].MemoryCost = normalize(namespaces[i].MemoryCost) + namespaces[i].StorageCost = normalize(namespaces[i].StorageCost) + namespaces[i].NetworkCost = normalize(namespaces[i].NetworkCost) + namespaces[i].CPUUsageCost = normalize(namespaces[i].CPUUsageCost) + namespaces[i].MemoryUsageCost = normalize(namespaces[i].MemoryUsageCost) + namespaces[i].IdleCost = normalize(namespaces[i].IdleCost) + } + + // Round everything for JSON stability. 
+ totalHourlyCost = roundTo(totalHourlyCost, 4) + totalStorageCost = roundTo(totalStorageCost, 4) + totalNetworkCost = roundTo(totalNetworkCost, 4) + totalIdleCost = roundTo(totalIdleCost, 4) + for i := range namespaces { + namespaces[i].HourlyCost = roundTo(namespaces[i].HourlyCost, 4) + namespaces[i].CPUCost = roundTo(namespaces[i].CPUCost, 4) + namespaces[i].MemoryCost = roundTo(namespaces[i].MemoryCost, 4) + namespaces[i].StorageCost = roundTo(namespaces[i].StorageCost, 4) + namespaces[i].NetworkCost = roundTo(namespaces[i].NetworkCost, 4) + namespaces[i].CPUUsageCost = roundTo(namespaces[i].CPUUsageCost, 4) + namespaces[i].MemoryUsageCost = roundTo(namespaces[i].MemoryUsageCost, 4) + namespaces[i].IdleCost = roundTo(namespaces[i].IdleCost, 4) + } + + return &CostSummary{ + Available: true, + Currency: opts.Currency, + Window: opts.Window, + TotalHourlyCost: totalHourlyCost, + TotalStorageCost: totalStorageCost, + TotalNetworkCost: totalNetworkCost, + TotalIdleCost: totalIdleCost, + ClusterEfficiency: clusterEfficiency, + Namespaces: namespaces, + } +} + +// safeRatio returns num/den or 0 when den is non-positive. +func safeRatio(num, den float64) float64 { + if den <= 0 { + return 0 + } + return num / den +} + +// ComputeCostSummaryFromProm is the PromQL-based compute path, for callers +// that have a scraped-OpenCost Prometheus available rather than the REST +// API (or that need to correlate cost with live Prometheus metrics in the +// same query). +// +// Contract: +// - If the primary OpenCost allocation metrics are absent entirely, the +// returned summary has Available=false and Reason=ReasonNoMetrics. +// - If the underlying query fails outright, Available=false and +// Reason=ReasonQueryError. Errors are never returned — callers serve +// the typed reason to the UI. +// - Numbers are rounded to 4 decimal places for cleaner JSON. 
+func ComputeCostSummaryFromProm(ctx context.Context, client *prom.Client, opts SummaryOptions) *CostSummary { + if client == nil { + return &CostSummary{Available: false, Reason: ReasonNoPrometheus} + } + if opts.Currency == "" { + opts.Currency = "USD" + } + if opts.Window == "" { + opts.Window = "1h" + } + + cpuResult, err := client.Query(ctx, + `sum by (namespace) (label_replace(avg_over_time(container_cpu_allocation{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") * on(node) group_left() node_cpu_hourly_cost)`) + if err != nil { + log.Printf("[opencost] CPU allocation query failed, trying opencost_container_cpu_cost_total: %v", err) + cpuResult, err = client.Query(ctx, + `sum by (namespace) (label_replace(rate(opencost_container_cpu_cost_total[1h]), "namespace", "$1", "exported_namespace", "(.+)"))`) + if err != nil { + log.Printf("[opencost] CPU allocation fallback query also failed: %v", err) + return &CostSummary{Available: false, Reason: ReasonQueryError} + } + } + + memResult, err := client.Query(ctx, + `sum by (namespace) (label_replace(avg_over_time(container_memory_allocation_bytes{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) + if err != nil { + log.Printf("[opencost] memory allocation query failed, trying opencost_container_memory_cost_total: %v", err) + memResult, err = client.Query(ctx, + `sum by (namespace) (label_replace(rate(opencost_container_memory_cost_total[1h]), "namespace", "$1", "exported_namespace", "(.+)"))`) + if err != nil { + log.Printf("[opencost] memory allocation fallback query also failed: %v", err) + return &CostSummary{Available: false, Reason: ReasonQueryError} + } + } + + if len(cpuResult.Series) == 0 && len(memResult.Series) == 0 { + return &CostSummary{Available: false, Reason: ReasonNoMetrics} + } + + // Usage queries are best-effort: efficiency / idle are derived from them + // and zero out cleanly if the queries 
fail, but a silent failure here can + // look identical to a low-utilization workload — so log when it happens. + cpuUsageRes, cpuUsageErr := client.Query(ctx, + `sum by (namespace) (label_replace(rate(container_cpu_usage_seconds_total{container!="", namespace!=""}[1h]), "node", "$1", "instance", "(.+?)(?::\\d+)?$") * on(node) group_left() node_cpu_hourly_cost)`) + if cpuUsageErr != nil { + log.Printf("[opencost] CPU usage query failed (efficiency will be 0 for affected rows): %v", cpuUsageErr) + } + cpuUsageMap := lastValuePerLabel(cpuUsageRes, cpuUsageErr, "namespace") + + memUsageRes, memUsageErr := client.Query(ctx, + `sum by (namespace) (label_replace(container_memory_working_set_bytes{container!="", namespace!=""}, "node", "$1", "instance", "(.+?)(?::\\d+)?$") / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) + if memUsageErr != nil { + log.Printf("[opencost] memory usage query failed (efficiency will be 0 for affected rows): %v", memUsageErr) + } + memUsageMap := lastValuePerLabel(memUsageRes, memUsageErr, "namespace") + + storageRes, storageErr := client.Query(ctx, + `sum by (namespace) (pv_hourly_cost * on(persistentvolume) group_left(namespace) kube_persistentvolume_claim_ref)`) + if storageErr != nil { + log.Printf("[opencost] storage cost query failed (storage costs will be 0): %v", storageErr) + } + storageMap := lastValuePerLabel(storageRes, storageErr, "namespace") + + nsMap := make(map[string]*NamespaceCost) + mergeSeriesIntoNamespaceField(cpuResult, nsMap, func(nc *NamespaceCost, v float64) { nc.CPUCost = v }) + mergeSeriesIntoNamespaceField(memResult, nsMap, func(nc *NamespaceCost, v float64) { nc.MemoryCost = v }) + + var totalHourlyCost, totalStorageCost, totalUsageCost, totalAllocCost float64 + namespaces := make([]NamespaceCost, 0, len(nsMap)) + for _, nc := range nsMap { + nc.HourlyCost = nc.CPUCost + nc.MemoryCost + nc.StorageCost = storageMap[nc.Name] + nc.HourlyCost += nc.StorageCost + totalStorageCost += nc.StorageCost + + 
nc.CPUUsageCost = cpuUsageMap[nc.Name] + nc.MemoryUsageCost = memUsageMap[nc.Name] + allocCost := nc.CPUCost + nc.MemoryCost + usageCost := nc.CPUUsageCost + nc.MemoryUsageCost + nc.Efficiency = efficiencyPct(usageCost, allocCost) + nc.IdleCost = idleFromUsage(usageCost, allocCost) + totalAllocCost += allocCost + totalUsageCost += usageCost + totalHourlyCost += nc.HourlyCost + namespaces = append(namespaces, *nc) + } + + if nodeResult, err := client.Query(ctx, `sum(node_total_hourly_cost)`); err == nil && len(nodeResult.Series) > 0 && len(nodeResult.Series[0].DataPoints) > 0 { + if nodeCost := nodeResult.Series[0].DataPoints[0].Value; nodeCost > totalHourlyCost { + totalHourlyCost = nodeCost + } + } + + sort.Slice(namespaces, func(i, j int) bool { + return namespaces[i].HourlyCost > namespaces[j].HourlyCost + }) + + clusterEfficiency := efficiencyPct(totalUsageCost, totalAllocCost) + totalIdleCost := idleFromUsage(totalUsageCost, totalAllocCost) + + totalHourlyCost = roundTo(totalHourlyCost, 4) + totalStorageCost = roundTo(totalStorageCost, 4) + totalIdleCost = roundTo(totalIdleCost, 4) + for i := range namespaces { + namespaces[i].HourlyCost = roundTo(namespaces[i].HourlyCost, 4) + namespaces[i].CPUCost = roundTo(namespaces[i].CPUCost, 4) + namespaces[i].MemoryCost = roundTo(namespaces[i].MemoryCost, 4) + namespaces[i].StorageCost = roundTo(namespaces[i].StorageCost, 4) + namespaces[i].CPUUsageCost = roundTo(namespaces[i].CPUUsageCost, 4) + namespaces[i].MemoryUsageCost = roundTo(namespaces[i].MemoryUsageCost, 4) + namespaces[i].IdleCost = roundTo(namespaces[i].IdleCost, 4) + } + + return &CostSummary{ + Available: true, + Currency: opts.Currency, + Window: opts.Window, + TotalHourlyCost: totalHourlyCost, + TotalStorageCost: totalStorageCost, + TotalIdleCost: totalIdleCost, + ClusterEfficiency: clusterEfficiency, + Namespaces: namespaces, + } +} + +func mergeSeriesIntoNamespaceField(result *prom.QueryResult, nsMap map[string]*NamespaceCost, set 
// roundTo rounds val to `places` decimal places and never returns NaN or Inf
// for a finite input, to keep JSON responses stable (encoding/json refuses
// to marshal non-finite floats).
//
//   - NaN/Inf input → 0.
//   - If scaling by 10^places overflows, val is already far too large to
//     carry fractional digits, so it is returned unchanged.
//   - If 10^places underflows to 0 (very negative places), the rounding
//     granularity exceeds any representable value, so the result is 0.
func roundTo(val float64, places int) float64 {
	if math.IsNaN(val) || math.IsInf(val, 0) {
		return 0
	}
	pow := math.Pow(10, float64(places))
	if pow == 0 {
		return 0
	}
	scaled := val * pow
	if math.IsInf(pow, 0) || math.IsInf(scaled, 0) {
		return val
	}
	return math.Round(scaled) / pow
}

// efficiencyPct returns 100 * usage / alloc rounded to 1 decimal,
// clamped to [0, 100]. Returns 0 when usage or alloc is non-positive
// (treated as "no data" — distinct from "100% idle"). A NaN or infinite
// ratio also yields 0, because roundTo maps non-finite values to 0.
func efficiencyPct(usage, alloc float64) float64 {
	if usage <= 0 || alloc <= 0 {
		return 0
	}
	eff := roundTo((usage/alloc)*100, 1)
	if eff > 100 {
		eff = 100
	}
	return eff
}

// idleFromUsage returns max(alloc - usage, 0) but only when both are
// positive. Mirrors efficiencyPct's "no data ≠ 100% idle" semantics.
//
// Non-finite inputs count as "no data" as well: NaN compares false against
// every bound and would otherwise slip through both guards, and callers sum
// the result into running totals where a single NaN or Inf would wipe out
// every other row's contribution.
func idleFromUsage(usage, alloc float64) float64 {
	if usage <= 0 || alloc <= 0 {
		return 0
	}
	idle := alloc - usage
	if math.IsNaN(idle) || math.IsInf(idle, 0) || idle < 0 {
		return 0
	}
	return idle
}
+func fakeOpenCost(t *testing.T, bodyForAllocation string) *RESTClient { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/allocation": + _, _ = w.Write([]byte(bodyForAllocation)) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(srv.Close) + tr := &httpTransport{baseURL: srv.URL, client: srv.Client()} + return NewRESTClient(tr) +} + +// httpTransport is a minimal Transport backed by net/http for tests. +type httpTransport struct { + baseURL string + client *http.Client +} + +func (t *httpTransport) Do(ctx context.Context, method, path string, params url.Values) ([]byte, error) { + u := t.baseURL + path + if len(params) > 0 { + u = u + "?" + params.Encode() + } + req, err := http.NewRequestWithContext(ctx, method, u, nil) + if err != nil { + return nil, err + } + resp, err := t.client.Do(req) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + buf := make([]byte, 0, 4096) + tmp := make([]byte, 4096) + for { + n, err := resp.Body.Read(tmp) + if n > 0 { + buf = append(buf, tmp[:n]...) + } + if err != nil { + break + } + } + return buf, nil +} + +func (t *httpTransport) Address() string { return t.baseURL } + +// buildAllocationResponse builds a valid OpenCost /allocation body from a +// namespace→totalCost map, filling CPU and RAM with a 60/40 split so the +// test can verify splits too. Efficiency defaults to 50%. 
+func buildAllocationResponse(t *testing.T, rows map[string]float64) string { + t.Helper() + window := make(map[string]*Allocation, len(rows)) + for ns, total := range rows { + window[ns] = &Allocation{ + Name: ns, + CPUCost: total * 0.6, + RAMCost: total * 0.4, + TotalCost: total, + TotalEfficiency: 0.5, + } + } + resp := AllocationResponse{ + Code: 200, + Data: []map[string]*Allocation{window}, + } + b, err := json.Marshal(resp) + if err != nil { + t.Fatal(err) + } + return string(b) +} + +func TestComputeCostSummary_REST_HappyPath(t *testing.T) { + body := buildAllocationResponse(t, map[string]float64{ + "checkout": 5.00, + "payments": 2.00, + "user-svc": 0.75, + }) + client := fakeOpenCost(t, body) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + if !got.Available { + t.Fatalf("expected Available=true; got %+v", got) + } + if got.Currency != "USD" || got.Window != "1h" { + t.Errorf("defaults not applied: currency=%q window=%q", got.Currency, got.Window) + } + if len(got.Namespaces) != 3 { + t.Fatalf("want 3 namespaces, got %d", len(got.Namespaces)) + } + // Sorted by HourlyCost desc. + if got.Namespaces[0].Name != "checkout" { + t.Errorf("want checkout first, got %s", got.Namespaces[0].Name) + } + if got.Namespaces[0].HourlyCost != 5.00 { + t.Errorf("checkout HourlyCost=%v, want 5.00", got.Namespaces[0].HourlyCost) + } + if got.Namespaces[0].CPUCost != 3.00 { // 60% of 5 + t.Errorf("checkout CPUCost=%v, want 3.00", got.Namespaces[0].CPUCost) + } + // Efficiency 50% roundtrip + if got.Namespaces[0].Efficiency != 50 { + t.Errorf("efficiency=%v, want 50", got.Namespaces[0].Efficiency) + } + // Cluster totals: sum of 5+2+0.75 = 7.75 + if got.TotalHourlyCost != 7.75 { + t.Errorf("TotalHourlyCost=%v, want 7.75", got.TotalHourlyCost) + } +} + +func TestComputeCostSummary_REST_IdleRowSurfaced(t *testing.T) { + // OpenCost emits __idle__ for unallocated node capacity. 
We surface it + // as TotalIdleCost (not a namespace row), and do NOT roll it into + // TotalHourlyCost — total hourly is the sum of *allocated* spend, so + // the UI can render idle as a separate cell without double-counting. + window := map[string]*Allocation{ + "checkout": {Name: "checkout", CPUCost: 1.0, RAMCost: 0.5, TotalCost: 1.5, TotalEfficiency: 0.6}, + "__idle__": {Name: "__idle__", CPUCost: 0.8, RAMCost: 0.2, TotalCost: 1.0}, + } + body, _ := json.Marshal(AllocationResponse{Code: 200, Data: []map[string]*Allocation{window}}) + client := fakeOpenCost(t, string(body)) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + if !got.Available { + t.Fatal("want Available=true") + } + // TotalIdleCost is the sum of __idle__ (1.0, cluster-level unused + // capacity) + per-namespace idle (checkout: alloc 1.5 × (1 - eff 0.6) + // = 0.6). The UI surfaces both together as "waste". + if got.TotalIdleCost != 1.6 { + t.Errorf("TotalIdleCost=%v, want 1.6 (__idle__ 1.0 + checkout ns-idle 0.6)", got.TotalIdleCost) + } + for _, ns := range got.Namespaces { + if ns.Name == "__idle__" { + t.Error("__idle__ must not appear as a regular namespace row") + } + } + // Allocated-only total = 1.5 for checkout; __idle__ excluded. + if got.TotalHourlyCost != 1.5 { + t.Errorf("TotalHourlyCost=%v, want 1.5 (allocated only; __idle__ goes to TotalIdleCost)", got.TotalHourlyCost) + } +} + +func TestComputeCostSummary_REST_NegativeIdleClampedToZero(t *testing.T) { + // Real-world: OpenCost can report a negative __idle__ totalCost when + // burstable workloads over-consume vs node pricing. The __idle__ + // contribution clamps to 0; per-namespace idle (positive, from + // under-utilization) still counts in the total. 
+ window := map[string]*Allocation{ + "app": {Name: "app", CPUCost: 0.5, RAMCost: 0.1, TotalCost: 0.6, TotalEfficiency: 0.4}, + "__idle__": {Name: "__idle__", CPUCost: -0.3, RAMCost: -0.1}, + } + body, _ := json.Marshal(AllocationResponse{Code: 200, Data: []map[string]*Allocation{window}}) + client := fakeOpenCost(t, string(body)) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + // Expect: __idle__ clamped to 0, app ns-idle = 0.6 × (1 - 0.4) = 0.36. + if got.TotalIdleCost != 0.36 { + t.Errorf("TotalIdleCost=%v, want 0.36 (__idle__ clamped, app ns-idle 0.36)", got.TotalIdleCost) + } + if got.TotalHourlyCost != 0.6 { + t.Errorf("TotalHourlyCost should still be 0.6 (allocated only); got %v", got.TotalHourlyCost) + } +} + +func TestComputeCostSummary_REST_WindowNormalization(t *testing.T) { + // OpenCost's /allocation returns totalCost summed over the whole + // window. We must divide by the window's hours to present a rate so + // the UI can multiply by 730 for monthly projection without + // ballooning the numbers when the user picks 24h / 7d / 30d. + window := map[string]*Allocation{ + "svc": {Name: "svc", CPUCost: 24.0, RAMCost: 0, TotalCost: 24.0, TotalEfficiency: 0.5}, + } + body, _ := json.Marshal(AllocationResponse{Code: 200, Data: []map[string]*Allocation{window}}) + client := fakeOpenCost(t, string(body)) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{Window: "24h"}) + if !got.Available { + t.Fatal("want Available=true") + } + // 24.0 total over 24h → $1/hr. + if got.TotalHourlyCost != 1.0 { + t.Errorf("TotalHourlyCost=%v, want 1.0 ($24 total / 24h = $1/hr)", got.TotalHourlyCost) + } + if got.Namespaces[0].HourlyCost != 1.0 { + t.Errorf("svc.HourlyCost=%v, want 1.0", got.Namespaces[0].HourlyCost) + } +} + +func TestComputeCostSummary_REST_EfficiencyCappedBeforeAveraging(t *testing.T) { + // OpenCost TotalEfficiency can exceed 1 for burstable workloads. 
A + // single runaway row must not dominate the fleet average. + window := map[string]*Allocation{ + "normal": {Name: "normal", CPUCost: 1.0, RAMCost: 0, TotalCost: 1.0, TotalEfficiency: 0.2}, + "burstable": {Name: "burstable", CPUCost: 1.0, RAMCost: 0, TotalCost: 1.0, TotalEfficiency: 100.0}, + } + body, _ := json.Marshal(AllocationResponse{Code: 200, Data: []map[string]*Allocation{window}}) + client := fakeOpenCost(t, string(body)) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + // Burstable capped at 100%. Normal = 20%. Mean = 60%. + if got.ClusterEfficiency < 58 || got.ClusterEfficiency > 62 { + t.Errorf("ClusterEfficiency=%v, want ~60 (cap+avg)", got.ClusterEfficiency) + } + // Per-row caps too. + for _, ns := range got.Namespaces { + if ns.Efficiency > 100 { + t.Errorf("%s efficiency=%v exceeds cap", ns.Name, ns.Efficiency) + } + } +} + +func TestComputeCostSummary_REST_NoMetricsReason(t *testing.T) { + body, _ := json.Marshal(AllocationResponse{Code: 200, Data: []map[string]*Allocation{{}}}) + client := fakeOpenCost(t, string(body)) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + if got.Available { + t.Error("expected Available=false for empty allocation data") + } + if got.Reason != ReasonNoMetrics { + t.Errorf("Reason=%q, want %q", got.Reason, ReasonNoMetrics) + } +} + +func TestComputeCostSummary_REST_QueryErrorReason(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + })) + defer srv.Close() + client := NewRESTClient(&httpTransport{baseURL: srv.URL, client: srv.Client()}) + + got := ComputeCostSummary(context.Background(), client, SummaryOptions{}) + if got.Available { + t.Error("expected Available=false on 502") + } + // Any non-2xx yields parse-failure on empty body or json error → Reason maps to query_error. 
+ if got.Reason != ReasonQueryError { + t.Errorf("Reason=%q, want %q", got.Reason, ReasonQueryError) + } +} + +func TestComputeCostSummary_REST_ForwardsWindow(t *testing.T) { + var capturedQuery url.Values + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedQuery = r.URL.Query() + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"code":200,"data":[{}]}`)) + })) + defer srv.Close() + client := NewRESTClient(&httpTransport{baseURL: srv.URL, client: srv.Client()}) + + _ = ComputeCostSummary(context.Background(), client, SummaryOptions{Window: "7d"}) + if capturedQuery.Get("window") != "7d" { + t.Errorf("window not forwarded: got %q", capturedQuery.Get("window")) + } + if capturedQuery.Get("aggregate") != "namespace" { + t.Errorf("aggregate not set: got %q", capturedQuery.Get("aggregate")) + } + if capturedQuery.Get("includeIdle") != "true" { + t.Errorf("includeIdle not set: got %q", capturedQuery.Get("includeIdle")) + } +} diff --git a/pkg/opencost/compute_test.go b/pkg/opencost/compute_test.go new file mode 100644 index 000000000..ef658cef8 --- /dev/null +++ b/pkg/opencost/compute_test.go @@ -0,0 +1,253 @@ +package opencost + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strconv" + "strings" + "testing" + + "github.com/skyhook-io/radar/pkg/prom" +) + +// scriptedProm returns a prom.Client backed by a httptest server that +// serves canned responses keyed by a predicate applied to the PromQL query. +// Predicates are tried in order; the first matching one wins. +func scriptedProm(t *testing.T, cases []scriptedCase) *prom.Client { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + for _, c := range cases { + if c.matches(q) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(c.body)) + return + } + } + // Default: success with empty result. 
+ w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + })) + t.Cleanup(srv.Close) + return prom.NewClient(prom.NewHTTPTransport(srv.URL, "", nil)) +} + +type scriptedCase struct { + contains string + body string +} + +func (c scriptedCase) matches(q string) bool { + return strings.Contains(q, c.contains) +} + +// vectorBody helps build a minimal Prometheus vector response. +func vectorBody(samples map[string]float64) string { + type result struct { + Metric map[string]string `json:"metric"` + Value []interface{} `json:"value"` + } + body := struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []result `json:"result"` + } `json:"data"` + }{Status: "success"} + body.Data.ResultType = "vector" + for ns, v := range samples { + body.Data.Result = append(body.Data.Result, result{ + Metric: map[string]string{"namespace": ns}, + Value: []interface{}{1700000000.0, formatFloat(v)}, + }) + } + b, _ := json.Marshal(body) + return string(b) +} + +func scalarBody(v float64) string { + type result struct { + Metric map[string]string `json:"metric"` + Value []interface{} `json:"value"` + } + body := struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []result `json:"result"` + } `json:"data"` + }{Status: "success"} + body.Data.ResultType = "vector" + body.Data.Result = []result{{Metric: map[string]string{}, Value: []interface{}{1700000000.0, formatFloat(v)}}} + b, _ := json.Marshal(body) + return string(b) +} + +// formatFloat renders a value the way Prometheus does — a numeric string +// with enough precision to round-trip the test inputs exactly. 
+func formatFloat(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) +} + +func TestComputeCostSummary_HappyPath(t *testing.T) { + client := scriptedProm(t, []scriptedCase{ + {contains: "container_cpu_allocation", body: vectorBody(map[string]float64{"checkout": 2.0, "payments": 1.0})}, + {contains: "container_memory_allocation_bytes", body: vectorBody(map[string]float64{"checkout": 3.0, "payments": 0.5})}, + {contains: "container_cpu_usage_seconds_total", body: vectorBody(map[string]float64{"checkout": 0.8, "payments": 0.6})}, + {contains: "container_memory_working_set_bytes", body: vectorBody(map[string]float64{"checkout": 1.2, "payments": 0.25})}, + {contains: "pv_hourly_cost", body: vectorBody(map[string]float64{"checkout": 0.05})}, + {contains: "node_total_hourly_cost", body: scalarBody(8.0)}, // exceeds sum of namespaces, so it wins + }) + + got := ComputeCostSummaryFromProm(context.Background(), client, SummaryOptions{}) + if !got.Available { + t.Fatalf("summary unavailable: %+v", got) + } + if got.Currency != "USD" || got.Window != "1h" { + t.Errorf("currency/window defaults: %+v", got) + } + if got.TotalHourlyCost != 8.0 { + t.Errorf("TotalHourlyCost=%v, want 8.0 (node_total_hourly_cost ceiling)", got.TotalHourlyCost) + } + if got.TotalStorageCost != 0.05 { + t.Errorf("TotalStorageCost=%v, want 0.05", got.TotalStorageCost) + } + // totalAlloc = (2+3) + (1+0.5) = 6.5; totalUsage = (0.8+1.2) + (0.6+0.25) = 2.85 + // clusterEff = 2.85/6.5 * 100 = 43.85 → 43.8 at 1 dp + if got.ClusterEfficiency < 43 || got.ClusterEfficiency > 44 { + t.Errorf("ClusterEfficiency=%v, want ~43.8", got.ClusterEfficiency) + } + // totalIdle = 6.5 - 2.85 = 3.65 + if got.TotalIdleCost < 3.5 || got.TotalIdleCost > 3.8 { + t.Errorf("TotalIdleCost=%v, want ~3.65", got.TotalIdleCost) + } + if len(got.Namespaces) != 2 { + t.Fatalf("expected 2 namespaces, got %d", len(got.Namespaces)) + } + // Sorted by HourlyCost desc; checkout = 2+3+0.05 = 5.05 > payments = 1+0.5 = 1.5 + if 
got.Namespaces[0].Name != "checkout" { + t.Errorf("first namespace should be checkout (higher cost); got %s", got.Namespaces[0].Name) + } + if got.Namespaces[0].HourlyCost != 5.05 { + t.Errorf("checkout.HourlyCost=%v, want 5.05", got.Namespaces[0].HourlyCost) + } +} + +func TestComputeCostSummary_NoMetricsReason(t *testing.T) { + client := scriptedProm(t, []scriptedCase{ + // All queries return empty vector results. + }) + got := ComputeCostSummaryFromProm(context.Background(), client, SummaryOptions{}) + if got.Available { + t.Error("expected Available=false when no metrics") + } + if got.Reason != ReasonNoMetrics { + t.Errorf("Reason=%q, want %q", got.Reason, ReasonNoMetrics) + } +} + +func TestComputeCostSummary_QueryErrorReason(t *testing.T) { + // Both primary and opencost_* fallback fail with HTTP error. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + })) + defer srv.Close() + client := prom.NewClient(prom.NewHTTPTransport(srv.URL, "", nil)) + + got := ComputeCostSummaryFromProm(context.Background(), client, SummaryOptions{}) + if got.Available { + t.Error("expected Available=false on query error") + } + if got.Reason != ReasonQueryError { + t.Errorf("Reason=%q", got.Reason) + } +} + +func TestComputeCostSummary_FallsBackToOpencostMetricNames(t *testing.T) { + // First query (container_cpu_allocation) returns an error, then + // the fallback (opencost_container_cpu_cost_total) succeeds. + // + // Simulated with a counter that errors the first time and succeeds the + // second. The test uses an HTTP handler that inspects the query string + // and returns accordingly. 
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + switch { + case strings.Contains(q, "container_cpu_allocation"): + w.WriteHeader(http.StatusBadGateway) + case strings.Contains(q, "opencost_container_cpu_cost_total"): + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(vectorBody(map[string]float64{"checkout": 2.0}))) + case strings.Contains(q, "container_memory_allocation_bytes"): + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(vectorBody(map[string]float64{"checkout": 1.0}))) + default: + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + } + })) + defer srv.Close() + client := prom.NewClient(prom.NewHTTPTransport(srv.URL, "", nil)) + + got := ComputeCostSummaryFromProm(context.Background(), client, SummaryOptions{}) + if !got.Available { + t.Fatalf("expected Available=true with fallback metrics; %+v", got) + } + if len(got.Namespaces) != 1 || got.Namespaces[0].Name != "checkout" { + t.Errorf("unexpected namespaces: %+v", got.Namespaces) + } +} + +func TestComputeCostSummary_RoundsValues(t *testing.T) { + client := scriptedProm(t, []scriptedCase{ + {contains: "container_cpu_allocation", body: vectorBody(map[string]float64{"x": 1.123456789})}, + {contains: "container_memory_allocation_bytes", body: vectorBody(map[string]float64{"x": 2.987654321})}, + }) + got := ComputeCostSummaryFromProm(context.Background(), client, SummaryOptions{}) + if !got.Available { + t.Fatalf("summary unavailable: %+v", got) + } + nc := got.Namespaces[0] + if nc.CPUCost != 1.1235 { + t.Errorf("CPU rounding: got %v, want 1.1235", nc.CPUCost) + } + if nc.MemoryCost != 2.9877 { + t.Errorf("Memory rounding: got %v, want 2.9877", nc.MemoryCost) + } +} + +func TestWindowHours(t *testing.T) { + cases := []struct { + in string + want float64 + }{ + // Standard units + 
{"1h", 1}, + {"24h", 24}, + {"7d", 168}, + {"1w", 168}, + {"30d", 720}, + // Decimal hours (rare but accepted) + {"1.5h", 1.5}, + // Minutes — documented decision to treat lone "m" as minutes, + // not months. Pinned here so the windowHours("m") comment can't + // be quietly "fixed" to mean months. + {"5m", 5.0 / 60}, + // Fallbacks: empty, missing unit, parse error, non-positive + {"", 1}, + {"h", 1}, + {"-5h", 1}, + {"0h", 1}, + {"abch", 1}, + // Unknown unit + {"3y", 1}, + } + for _, tc := range cases { + got := windowHours(tc.in) + if got != tc.want { + t.Errorf("windowHours(%q) = %v, want %v", tc.in, got, tc.want) + } + } +} diff --git a/pkg/opencost/nodes.go b/pkg/opencost/nodes.go new file mode 100644 index 000000000..e3eda73d7 --- /dev/null +++ b/pkg/opencost/nodes.go @@ -0,0 +1,69 @@ +package opencost + +import ( + "context" + "log" + "sort" + + "github.com/skyhook-io/radar/pkg/prom" +) + +// ComputeNodeCosts returns per-node hourly cost breakdown sourced from the +// OpenCost-exported Prometheus metrics (node_total_hourly_cost, +// node_cpu_hourly_cost, node_ram_hourly_cost). Sorted descending by hourly +// cost. Errors map to typed Reason values; never returned to callers because +// the HTTP layer serves them in-band. 
+func ComputeNodeCosts(ctx context.Context, client *prom.Client) *NodeCostResponse { + if client == nil { + return &NodeCostResponse{Available: false, Reason: ReasonNoPrometheus} + } + + totalResult, err := client.Query(ctx, `node_total_hourly_cost`) + if err != nil { + log.Printf("[opencost] node_total_hourly_cost query failed: %v", err) + return &NodeCostResponse{Available: false, Reason: ReasonQueryError} + } + if len(totalResult.Series) == 0 { + return &NodeCostResponse{Available: false, Reason: ReasonNoMetrics} + } + + cpuResult, cpuErr := client.Query(ctx, `node_cpu_hourly_cost`) + cpuMap := lastValuePerLabel(cpuResult, cpuErr, "node") + memResult, memErr := client.Query(ctx, `node_ram_hourly_cost`) + memMap := lastValuePerLabel(memResult, memErr, "node") + + nodes := make([]NodeCost, 0, len(totalResult.Series)) + for _, s := range totalResult.Series { + node := s.Labels["node"] + if node == "" || len(s.DataPoints) == 0 { + continue + } + nodes = append(nodes, NodeCost{ + Name: node, + InstanceType: s.Labels["instance_type"], + Region: s.Labels["region"], + HourlyCost: roundTo(s.DataPoints[len(s.DataPoints)-1].Value, 4), + CPUCost: roundTo(cpuMap[node], 4), + MemoryCost: roundTo(memMap[node], 4), + }) + } + + sort.Slice(nodes, func(i, j int) bool { return nodes[i].HourlyCost > nodes[j].HourlyCost }) + + return &NodeCostResponse{Available: true, Nodes: nodes} +} + +func lastValuePerLabel(result *prom.QueryResult, err error, label string) map[string]float64 { + out := make(map[string]float64) + if err != nil || result == nil { + return out + } + for _, s := range result.Series { + v := s.Labels[label] + if v == "" || len(s.DataPoints) == 0 { + continue + } + out[v] = s.DataPoints[len(s.DataPoints)-1].Value + } + return out +} diff --git a/pkg/opencost/rest_client.go b/pkg/opencost/rest_client.go new file mode 100644 index 000000000..200759ec1 --- /dev/null +++ b/pkg/opencost/rest_client.go @@ -0,0 +1,145 @@ +package opencost + +import ( + "context" + 
// AllocationOptions controls an /allocation query.
type AllocationOptions struct {
	// Window is a human-readable duration or a "start,end" range; "1h" is
	// sent when it is empty.
	// Examples: "1h", "24h", "7d", "2024-01-01T00:00:00Z,2024-01-08T00:00:00Z"
	Window string

	// Aggregate selects how rows are grouped, using any value OpenCost
	// supports: "namespace", "controller", "pod", "container", "cluster",
	// label-based grouping, etc. Omitted from the request when empty.
	Aggregate string

	// Step is the time-bucket size ("1h", "1d", "1w"). Empty means a single
	// bucket and the parameter is omitted.
	Step string

	// IncludeIdle requests a synthetic __idle__ row representing
	// unallocated node capacity, so the UI can surface idle cost.
	IncludeIdle bool

	// IncludeSharedCost requests that shared/overhead costs be included.
	IncludeSharedCost bool

	// Filter is a comma-separated OpenCost filter expression (v1.106+).
	// Empty means no filter and the parameter is omitted.
	Filter string
}

// toQuery converts the options into /allocation query parameters. "window"
// is always present (defaulting to "1h"); the other string parameters are
// emitted only when non-empty and the boolean ones only when true.
func (o AllocationOptions) toQuery() url.Values {
	window := o.Window
	if window == "" {
		window = "1h"
	}
	params := url.Values{"window": {window}}

	optionalStrings := []struct{ key, value string }{
		{"aggregate", o.Aggregate},
		{"step", o.Step},
		{"filter", o.Filter},
	}
	for _, p := range optionalStrings {
		if p.value != "" {
			params.Set(p.key, p.value)
		}
	}

	flags := []struct {
		key string
		on  bool
	}{
		{"includeIdle", o.IncludeIdle},
		{"includeSharedCost", o.IncludeSharedCost},
	}
	for _, f := range flags {
		if f.on {
			params.Set(f.key, "true")
		}
	}
	return params
}
+ // Populated per OpenCost's response shape. + Properties map[string]interface{} `json:"properties,omitempty"` +} + +// AllocationResponse is the envelope OpenCost returns from /allocation. +// The `data` field is an array of time-window dicts: each dict maps an +// aggregate row name (e.g. a namespace name, or "__idle__") → Allocation. +type AllocationResponse struct { + Code int `json:"code"` + Status string `json:"status,omitempty"` + Data []map[string]*Allocation `json:"data"` + Message string `json:"message,omitempty"` +} + +// GetAllocation issues a GET /allocation call. +func (c *RESTClient) GetAllocation(ctx context.Context, opts AllocationOptions) (*AllocationResponse, error) { + body, err := c.t.Do(ctx, "GET", "/allocation", opts.toQuery()) + if err != nil { + return nil, fmt.Errorf("opencost.GetAllocation: %w", err) + } + var resp AllocationResponse + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("opencost.GetAllocation: parse response from %s: %w", c.t.Address(), err) + } + if resp.Code != 0 && resp.Code != 200 { + return &resp, fmt.Errorf("opencost: HTTP %d: %s", resp.Code, resp.Message) + } + return &resp, nil +} diff --git a/pkg/opencost/transport.go b/pkg/opencost/transport.go new file mode 100644 index 000000000..732136c97 --- /dev/null +++ b/pkg/opencost/transport.go @@ -0,0 +1,24 @@ +package opencost + +import ( + "context" + "net/url" +) + +// Transport is the HTTP transport used by RESTClient to reach OpenCost's +// REST API. Same shape as pkg/prom.Transport (path + params in, body out) +// so a single concrete type in a caller can satisfy both interfaces. +// +// Typical implementations: direct HTTP against a known URL (in-cluster or +// kubectl port-forwarded), a tunneled proxy transport for callers that +// can't reach the cluster directly, and an httptest server in unit tests. +type Transport interface { + // Do issues a request to path (e.g. 
// TrendOptions controls ComputeCostTrend.
type TrendOptions struct {
	// Window is the overall time range (e.g. "7d", "30d"). Defaults to "24h".
	Window string

	// Step is the bucket size inside the window. When empty, defaultStep
	// derives it from Window: up to 1h → 5m, up to 24h → 6h, up to 7d → 1d,
	// anything longer → 2d. If set, overrides the default.
	Step string

	// Aggregate controls how rows inside each bucket are grouped. Defaults
	// to "namespace" so callers can produce both total and per-namespace
	// series from the same response. Use "cluster" when only the fleet-total
	// line is needed (cheaper for the backend + OpenCost).
	Aggregate string
}
+// +// Each data point's Value is normalized to $/hr for the bucket (OpenCost's +// per-bucket totalCost ÷ bucket duration), matching the hourly-rate +// convention used throughout the Costs UI. The UI multiplies by 730 for +// monthly projections or hours-in-period for retrospective totals. +func ComputeCostTrend(ctx context.Context, client *RESTClient, opts TrendOptions) *CostTrendResponse { + window := opts.Window + if window == "" { + window = "24h" + } + aggregate := opts.Aggregate + if aggregate == "" { + aggregate = "namespace" + } + step := opts.Step + if step == "" { + step = defaultStep(window) + } + + resp, err := client.GetAllocation(ctx, AllocationOptions{ + Window: window, + Aggregate: aggregate, + Step: step, + IncludeIdle: false, // idle is a summary concept; drop it here to keep the chart focused on spend + }) + if err != nil { + log.Printf("[opencost] /allocation trend failed (window=%s step=%s): %v", window, step, err) + return &CostTrendResponse{Available: false, Reason: ReasonQueryError, Range: window} + } + if resp == nil || len(resp.Data) == 0 { + return &CostTrendResponse{Available: false, Reason: ReasonNoMetrics, Range: window} + } + + bucketHours := windowHours(step) + skippedBuckets := 0 + + // Walk buckets in order. For each bucket, accumulate per-aggregate + // totals and the bucket timestamp (parsed from one row's Start, since + // every row in a bucket shares the same window). + seriesByName := make(map[string][]CostDataPoint) + totals := make([]CostDataPoint, 0, len(resp.Data)) + + for _, bucket := range resp.Data { + if len(bucket) == 0 { + continue + } + ts := bucketTimestamp(bucket) + if ts == 0 { + // No parseable Start on any row — skip rather than stamping all + // points at the Unix epoch, which would collapse the chart. + skippedBuckets++ + continue + } + var bucketTotal float64 + for name, a := range bucket { + if a == nil || name == "__idle__" { + continue + } + // Normalize to hourly rate for this bucket. 
OpenCost returns + // totalCost summed across the bucket; dividing by bucket + // duration (hours) gives the $/hr rate the UI consumes. + value := a.TotalCost / bucketHours + seriesByName[name] = append(seriesByName[name], CostDataPoint{ + Timestamp: ts, + Value: roundTo(value, 4), + }) + bucketTotal += a.TotalCost + } + totals = append(totals, CostDataPoint{ + Timestamp: ts, + Value: roundTo(bucketTotal/bucketHours, 4), + }) + } + + if skippedBuckets > 0 { + log.Printf("[opencost] trend dropped %d bucket(s) with no parseable timestamp (window=%s step=%s)", skippedBuckets, window, step) + } + + if len(totals) == 0 { + return &CostTrendResponse{Available: false, Reason: ReasonNoMetrics, Range: window} + } + + // Assemble the response. Put __total__ first so the UI can find it + // without scanning, then per-namespace series sorted by peak spend + // (descending). Non-total series are sorted so the chart's default + // stacking shows the biggest spenders consistently across refreshes. + series := make([]CostTrendSeries, 0, len(seriesByName)+1) + series = append(series, CostTrendSeries{ + Namespace: "__total__", + DataPoints: sortByTimestamp(totals), + }) + + type namedSeries struct { + name string + peak float64 + points []CostDataPoint + } + byPeak := make([]namedSeries, 0, len(seriesByName)) + for name, pts := range seriesByName { + pts = sortByTimestamp(pts) + peak := 0.0 + for _, p := range pts { + if p.Value > peak { + peak = p.Value + } + } + byPeak = append(byPeak, namedSeries{name: name, peak: peak, points: pts}) + } + sort.Slice(byPeak, func(i, j int) bool { return byPeak[i].peak > byPeak[j].peak }) + for _, s := range byPeak { + series = append(series, CostTrendSeries{ + Namespace: s.name, + DataPoints: s.points, + }) + } + + return &CostTrendResponse{ + Available: true, + Range: window, + Series: series, + } +} + +// defaultStep picks a sensible bucket size for a window. 
We bias toward +// fewer, coarser buckets than a typical charting library would because +// OpenCost's /allocation with step= scales roughly with bucket count — +// a 24h query at 1h step takes ~30s on a test cluster vs ~3s at 6h step. +// Callers behind short request deadlines need the response well under +// that budget. +// +// Bucket counts we target: 1h → 12, 24h → 4, 7d → 7, 30d → 15. +func defaultStep(window string) string { + hours := windowHours(window) + switch { + case hours <= 1: + return "5m" + case hours <= 24: + return "6h" + case hours <= 24*7: + return "1d" + default: + return "2d" + } +} + +// bucketTimestamp returns a Unix-seconds timestamp derived from the first +// allocation row in the bucket (each row in a bucket shares the same +// window, so any row is representative). Seconds because the PromQL trend +// path emits seconds, and both paths feed the same CostDataPoint.Timestamp +// field — the UI assumes seconds at the render layer. +func bucketTimestamp(bucket map[string]*Allocation) int64 { + for _, a := range bucket { + if a == nil { + continue + } + if a.Start != "" { + if t, err := time.Parse(time.RFC3339, a.Start); err == nil { + return t.Unix() + } + } + } + return 0 +} + +func sortByTimestamp(pts []CostDataPoint) []CostDataPoint { + sort.Slice(pts, func(i, j int) bool { return pts[i].Timestamp < pts[j].Timestamp }) + return pts +} diff --git a/pkg/opencost/trend_prom.go b/pkg/opencost/trend_prom.go new file mode 100644 index 000000000..f9f9a727e --- /dev/null +++ b/pkg/opencost/trend_prom.go @@ -0,0 +1,125 @@ +package opencost + +import ( + "context" + "log" + "sort" + "time" + + "github.com/skyhook-io/radar/pkg/prom" +) + +// TrendPromOptions controls ComputeCostTrendFromProm. +type TrendPromOptions struct { + // Range is "6h", "24h", "7d" (default "24h"). Drives the start/end and + // step of the underlying range query and is echoed on the response. 
+ Range string + + // MaxSeries is the top-N namespaces kept; the rest are aggregated into + // a single "other" series. Defaults to 8 when zero. + MaxSeries int +} + +// ComputeCostTrendFromProm returns a stacked per-namespace cost trend from +// OpenCost-exported Prometheus metrics. The top MaxSeries namespaces by +// latest cost are returned as individual series; the remainder is collapsed +// into a single "other" series. +// +// Contract mirrors ComputeCostSummaryFromProm: +// - Underlying range query fails → Available=false, Reason=ReasonQueryError. +// - No series returned → Available=false, Reason=ReasonNoMetrics. +func ComputeCostTrendFromProm(ctx context.Context, client *prom.Client, opts TrendPromOptions) *CostTrendResponse { + if client == nil { + return &CostTrendResponse{Available: false, Reason: ReasonNoPrometheus} + } + + start, end, step, label := resolveTrendRange(opts.Range) + maxSeries := opts.MaxSeries + if maxSeries <= 0 { + maxSeries = 8 + } + + const query = `sum by (namespace) ( + label_replace(avg_over_time(container_cpu_allocation{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") * on(node) group_left() node_cpu_hourly_cost +) + sum by (namespace) ( + label_replace(avg_over_time(container_memory_allocation_bytes{namespace!=""}[1h]), "namespace", "$1", "exported_namespace", "(.+)") / 1073741824 * on(node) group_left() node_ram_hourly_cost +)` + + result, err := client.QueryRange(ctx, query, start, end, step) + if err != nil { + log.Printf("[opencost] PromQL trend range query failed (range=%s): %v", label, err) + return &CostTrendResponse{Available: false, Reason: ReasonQueryError} + } + if len(result.Series) == 0 { + return &CostTrendResponse{Available: false, Reason: ReasonNoMetrics} + } + + type nsRank struct { + ns string + lastCost float64 + idx int + } + ranks := make([]nsRank, 0, len(result.Series)) + for i, s := range result.Series { + ns := s.Labels["namespace"] + if ns == "" { + continue + } + var last float64 
+ if len(s.DataPoints) > 0 { + last = s.DataPoints[len(s.DataPoints)-1].Value + } + ranks = append(ranks, nsRank{ns: ns, lastCost: last, idx: i}) + } + sort.Slice(ranks, func(i, j int) bool { return ranks[i].lastCost > ranks[j].lastCost }) + + topSet := make(map[int]bool, maxSeries) + series := make([]CostTrendSeries, 0, maxSeries+1) + for i, r := range ranks { + if i >= maxSeries { + break + } + topSet[r.idx] = true + s := result.Series[r.idx] + dps := make([]CostDataPoint, 0, len(s.DataPoints)) + for _, dp := range s.DataPoints { + dps = append(dps, CostDataPoint{Timestamp: dp.Timestamp, Value: roundTo(dp.Value, 4)}) + } + series = append(series, CostTrendSeries{Namespace: r.ns, DataPoints: dps}) + } + + if len(ranks) > maxSeries { + otherMap := make(map[int64]float64) + for i, s := range result.Series { + if topSet[i] { + continue + } + for _, dp := range s.DataPoints { + otherMap[dp.Timestamp] += dp.Value + } + } + if len(otherMap) > 0 { + dps := make([]CostDataPoint, 0, len(otherMap)) + for ts, val := range otherMap { + dps = append(dps, CostDataPoint{Timestamp: ts, Value: roundTo(val, 4)}) + } + sort.Slice(dps, func(i, j int) bool { return dps[i].Timestamp < dps[j].Timestamp }) + series = append(series, CostTrendSeries{Namespace: "other", DataPoints: dps}) + } + } + + return &CostTrendResponse{Available: true, Range: label, Series: series} +} + +// resolveTrendRange returns the start/end/step/label for the named Range. 
// resolveTrendRange translates a named range into range-query parameters.
// end is the current time and start lies the range's length before it:
//
//	"6h" → 6 hours back, 15-minute step
//	"7d" → 7 days back, 6-hour step
//
// Any other value, including the empty string, is treated as "24h"
// (24 hours back, 1-hour step). label is the normalized range name.
func resolveTrendRange(rangeStr string) (start, end time.Time, step time.Duration, label string) {
	lookback := 24 * time.Hour
	step, label = time.Hour, "24h"

	if rangeStr == "6h" {
		lookback, step, label = 6*time.Hour, 15*time.Minute, "6h"
	} else if rangeStr == "7d" {
		lookback, step, label = 7*24*time.Hour, 6*time.Hour, "7d"
	}

	end = time.Now()
	start = end.Add(-lookback)
	return start, end, step, label
}
"application/json") + _, _ = w.Write([]byte(body)) + })) + t.Cleanup(srv.Close) + return prom.NewClient(prom.NewHTTPTransport(srv.URL, "", nil)) +} + +func TestComputeCostTrendFromProm_TopNAndOther(t *testing.T) { + // 5 namespaces, ranked by latest value: a=10, b=8, c=5, d=3, e=1. + // MaxSeries=2 → top two (a, b) returned individually; c/d/e collapsed + // into a single "other" series with summed per-timestamp values. + client := rangeProm(t, matrixBody([]namespaceSeries{ + {"a", []dpoint{{1700000000, 9}, {1700003600, 10}}}, + {"b", []dpoint{{1700000000, 7}, {1700003600, 8}}}, + {"c", []dpoint{{1700000000, 4}, {1700003600, 5}}}, + {"d", []dpoint{{1700000000, 2}, {1700003600, 3}}}, + {"e", []dpoint{{1700000000, 1}, {1700003600, 1}}}, + })) + + got := ComputeCostTrendFromProm(context.Background(), client, TrendPromOptions{ + Range: "24h", + MaxSeries: 2, + }) + if !got.Available { + t.Fatalf("expected Available=true, got %+v", got) + } + if got.Range != "24h" { + t.Errorf("Range: got %q, want %q", got.Range, "24h") + } + if len(got.Series) != 3 { + t.Fatalf("expected 3 series (2 top + other), got %d: %v", len(got.Series), namesOf(got.Series)) + } + + // First two series are top-N by last value. + if got.Series[0].Namespace != "a" || got.Series[1].Namespace != "b" { + t.Errorf("top series: got [%s, %s], want [a, b]", got.Series[0].Namespace, got.Series[1].Namespace) + } + + // Third series is "other" — c+d+e summed per timestamp. + other := got.Series[2] + if other.Namespace != "other" { + t.Errorf("third series namespace: got %q, want %q", other.Namespace, "other") + } + if len(other.DataPoints) != 2 { + t.Fatalf("other should have 2 points, got %d", len(other.DataPoints)) + } + // Points are sorted by Timestamp ascending. 
+ if other.DataPoints[0].Timestamp != 1700000000 { + t.Errorf("other[0].Timestamp: got %d", other.DataPoints[0].Timestamp) + } + // c+d+e at ts=1700000000 = 4+2+1 = 7 + if other.DataPoints[0].Value != 7 { + t.Errorf("other[0].Value: got %v, want 7", other.DataPoints[0].Value) + } + // c+d+e at ts=1700003600 = 5+3+1 = 9 + if other.DataPoints[1].Value != 9 { + t.Errorf("other[1].Value: got %v, want 9", other.DataPoints[1].Value) + } +} + +func TestComputeCostTrendFromProm_AllUnderMaxSeriesNoOther(t *testing.T) { + // 2 namespaces, MaxSeries=8 → no "other" series. + client := rangeProm(t, matrixBody([]namespaceSeries{ + {"a", []dpoint{{1700000000, 1}}}, + {"b", []dpoint{{1700000000, 2}}}, + })) + got := ComputeCostTrendFromProm(context.Background(), client, TrendPromOptions{Range: "24h"}) + if !got.Available { + t.Fatalf("expected Available=true, got %+v", got) + } + if len(got.Series) != 2 { + t.Errorf("expected 2 series (no 'other'), got %d: %v", len(got.Series), namesOf(got.Series)) + } + for _, s := range got.Series { + if s.Namespace == "other" { + t.Errorf("unexpected 'other' series with %d points: %+v", len(s.DataPoints), s.DataPoints) + } + } +} + +func TestComputeCostTrendFromProm_EmptyNamespaceLabelSkipped(t *testing.T) { + // A series with no namespace label must not appear in the output (it + // can't be ranked or attributed). The implementation skips it during + // the rank pass. 
+ client := rangeProm(t, matrixBody([]namespaceSeries{ + {"", []dpoint{{1700000000, 99}}}, // would be top by value, but unnamed + {"a", []dpoint{{1700000000, 1}}}, + })) + got := ComputeCostTrendFromProm(context.Background(), client, TrendPromOptions{Range: "24h"}) + if !got.Available { + t.Fatalf("expected Available=true, got %+v", got) + } + for _, s := range got.Series { + if s.Namespace == "" { + t.Errorf("unexpected empty-namespace series in output: %+v", s) + } + } +} + +func TestComputeCostTrendFromProm_NilClient(t *testing.T) { + got := ComputeCostTrendFromProm(context.Background(), nil, TrendPromOptions{Range: "24h"}) + if got.Available { + t.Errorf("expected Available=false with nil client") + } + if got.Reason != ReasonNoPrometheus { + t.Errorf("Reason: got %q, want %q", got.Reason, ReasonNoPrometheus) + } +} + +func TestComputeCostTrendFromProm_NoSeries(t *testing.T) { + emptyBody := `{"status":"success","data":{"resultType":"matrix","result":[]}}` + client := rangeProm(t, emptyBody) + got := ComputeCostTrendFromProm(context.Background(), client, TrendPromOptions{Range: "24h"}) + if got.Available { + t.Errorf("expected Available=false on no series") + } + if got.Reason != ReasonNoMetrics { + t.Errorf("Reason: got %q, want %q", got.Reason, ReasonNoMetrics) + } +} + +func namesOf(series []CostTrendSeries) []string { + out := make([]string, len(series)) + for i, s := range series { + out[i] = s.Namespace + } + return out +} diff --git a/internal/opencost/types.go b/pkg/opencost/types.go similarity index 88% rename from internal/opencost/types.go rename to pkg/opencost/types.go index c977a53f8..8c8d22b61 100644 --- a/internal/opencost/types.go +++ b/pkg/opencost/types.go @@ -16,18 +16,24 @@ type CostSummary struct { Window string `json:"window,omitempty"` TotalHourlyCost float64 `json:"totalHourlyCost,omitempty"` TotalStorageCost float64 `json:"totalStorageCost,omitempty"` + TotalNetworkCost float64 `json:"totalNetworkCost,omitempty"` TotalIdleCost float64 
// WorkloadOwner identifies the workload that owns a pod by its name and its
// kind.
type WorkloadOwner struct {
	Name, Kind string
}
+type PodOwnerLookup func(podName string) (WorkloadOwner, bool) + +// ComputeWorkloadsFromProm returns workload-level cost breakdown for a +// namespace, sourced from OpenCost-exported Prometheus metrics with a +// caller-supplied pod→owner mapping (typically from a K8s informer cache). +// +// When ownerLookup is nil or can't resolve a pod, the pod is assigned to a +// fallback "standalone" workload whose name is the pod name with its hash +// suffixes stripped — best-effort grouping for orphan pods. +func ComputeWorkloadsFromProm(ctx context.Context, client *prom.Client, namespace string, ownerLookup PodOwnerLookup) *WorkloadCostResponse { + if client == nil { + return &WorkloadCostResponse{Namespace: namespace, Available: false, Reason: ReasonNoPrometheus} + } + if namespace == "" { + return &WorkloadCostResponse{Available: false, Reason: ReasonQueryError} + } + + safeNS := prom.SanitizeLabelValue(namespace) + + cpuResult, err := client.Query(ctx, + `sum by (pod) ((avg_over_time(container_cpu_allocation{exported_namespace="`+safeNS+`"}[1h]) or avg_over_time(container_cpu_allocation{namespace="`+safeNS+`", exported_namespace=""}[1h])) * on(node) group_left() node_cpu_hourly_cost)`) + if err != nil { + log.Printf("[opencost] workloads CPU query failed for ns=%q, trying opencost_container_cpu_cost_total: %v", namespace, err) + cpuResult, err = client.Query(ctx, + `sum by (pod) (rate(opencost_container_cpu_cost_total{exported_namespace="`+safeNS+`"}[1h]) or rate(opencost_container_cpu_cost_total{namespace="`+safeNS+`", exported_namespace=""}[1h]))`) + if err != nil { + log.Printf("[opencost] workloads CPU fallback query also failed for ns=%q: %v", namespace, err) + return &WorkloadCostResponse{Namespace: namespace, Available: false, Reason: ReasonQueryError} + } + } + + memResult, err := client.Query(ctx, + `sum by (pod) ((avg_over_time(container_memory_allocation_bytes{exported_namespace="`+safeNS+`"}[1h]) or 
avg_over_time(container_memory_allocation_bytes{namespace="`+safeNS+`", exported_namespace=""}[1h])) / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) + if err != nil { + log.Printf("[opencost] workloads memory query failed for ns=%q, trying opencost_container_memory_cost_total: %v", namespace, err) + memResult, err = client.Query(ctx, + `sum by (pod) (rate(opencost_container_memory_cost_total{exported_namespace="`+safeNS+`"}[1h]) or rate(opencost_container_memory_cost_total{namespace="`+safeNS+`", exported_namespace=""}[1h]))`) + if err != nil { + log.Printf("[opencost] workloads memory fallback query also failed for ns=%q: %v", namespace, err) + return &WorkloadCostResponse{Namespace: namespace, Available: false, Reason: ReasonQueryError} + } + } + + cpuUsageResult, cpuUsageErr := client.Query(ctx, + `sum by (pod) (label_replace(rate(container_cpu_usage_seconds_total{container!="", namespace="`+safeNS+`"}[1h]), "node", "$1", "instance", "(.+?)(?::\\d+)?$") * on(node) group_left() node_cpu_hourly_cost)`) + if cpuUsageErr != nil { + log.Printf("[opencost] workloads CPU usage query failed for ns=%q (efficiency will be 0): %v", namespace, cpuUsageErr) + } + memUsageResult, memUsageErr := client.Query(ctx, + `sum by (pod) (label_replace(container_memory_working_set_bytes{container!="", namespace="`+safeNS+`"}, "node", "$1", "instance", "(.+?)(?::\\d+)?$") / 1073741824 * on(node) group_left() node_ram_hourly_cost)`) + if memUsageErr != nil { + log.Printf("[opencost] workloads memory usage query failed for ns=%q (efficiency will be 0): %v", namespace, memUsageErr) + } + + if len(cpuResult.Series) == 0 && len(memResult.Series) == 0 { + // Queries succeeded but returned nothing — either the namespace has + // no scraped pods or OpenCost metrics aren't present. Surface the + // typed reason so the UI can render contextual guidance rather than + // an empty list. 
+ return &WorkloadCostResponse{Namespace: namespace, Available: false, Reason: ReasonNoMetrics} + } + + podCPUUsage := lastValuePerLabel(cpuUsageResult, cpuUsageErr, "pod") + podMemUsage := lastValuePerLabel(memUsageResult, memUsageErr, "pod") + + type podCost struct { + cpuCost, memoryCost, cpuUsage, memoryUsage float64 + } + podCosts := make(map[string]*podCost) + setPodLast := func(result *prom.QueryResult, set func(*podCost, float64)) { + if result == nil { + return + } + for _, s := range result.Series { + pod := s.Labels["pod"] + if pod == "" || len(s.DataPoints) == 0 { + continue + } + pc, ok := podCosts[pod] + if !ok { + pc = &podCost{} + podCosts[pod] = pc + } + set(pc, s.DataPoints[len(s.DataPoints)-1].Value) + } + } + setPodLast(cpuResult, func(pc *podCost, v float64) { pc.cpuCost = v }) + setPodLast(memResult, func(pc *podCost, v float64) { pc.memoryCost = v }) + for pod, pc := range podCosts { + pc.cpuUsage = podCPUUsage[pod] + pc.memoryUsage = podMemUsage[pod] + } + + workloadMap := make(map[WorkloadOwner]*WorkloadCost) + for podName, pc := range podCosts { + owner, ok := WorkloadOwner{}, false + if ownerLookup != nil { + owner, ok = ownerLookup(podName) + } + if !ok { + owner = WorkloadOwner{Name: stripPodSuffix(podName), Kind: "standalone"} + } + + wl, exists := workloadMap[owner] + if !exists { + wl = &WorkloadCost{Name: owner.Name, Kind: owner.Kind} + workloadMap[owner] = wl + } + wl.CPUCost += pc.cpuCost + wl.MemoryCost += pc.memoryCost + wl.CPUUsageCost += pc.cpuUsage + wl.MemoryUsageCost += pc.memoryUsage + wl.Replicas++ + } + + workloads := make([]WorkloadCost, 0, len(workloadMap)) + for _, wl := range workloadMap { + allocCost := wl.CPUCost + wl.MemoryCost + usageCost := wl.CPUUsageCost + wl.MemoryUsageCost + wl.HourlyCost = allocCost + wl.Efficiency = efficiencyPct(usageCost, allocCost) + wl.IdleCost = idleFromUsage(usageCost, allocCost) + wl.HourlyCost = roundTo(wl.HourlyCost, 4) + wl.CPUCost = roundTo(wl.CPUCost, 4) + wl.MemoryCost = 
roundTo(wl.MemoryCost, 4) + wl.CPUUsageCost = roundTo(wl.CPUUsageCost, 4) + wl.MemoryUsageCost = roundTo(wl.MemoryUsageCost, 4) + wl.IdleCost = roundTo(wl.IdleCost, 4) + workloads = append(workloads, *wl) + } + sort.Slice(workloads, func(i, j int) bool { return workloads[i].HourlyCost > workloads[j].HourlyCost }) + + return &WorkloadCostResponse{ + Available: true, + Namespace: namespace, + Workloads: workloads, + } +} + +// stripPodSuffix removes pod hash suffixes to approximate the workload name +// when owner-ref lookup fails. e.g. "myapp-7f8d9c-xyz12" → "myapp". +func stripPodSuffix(name string) string { + idx := strings.LastIndex(name, "-") + if idx <= 0 { + return name + } + name = name[:idx] + idx = strings.LastIndex(name, "-") + if idx <= 0 { + return name + } + return name[:idx] +} diff --git a/pkg/opencost/workloads_test.go b/pkg/opencost/workloads_test.go new file mode 100644 index 000000000..79c317a58 --- /dev/null +++ b/pkg/opencost/workloads_test.go @@ -0,0 +1,189 @@ +package opencost + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/skyhook-io/radar/pkg/prom" +) + +// podVectorBody builds a PromQL vector response where each result row has +// only a `pod` label — matching what `sum by (pod) (...)` queries return. 
// TestComputeWorkloads_OwnerLookupResolves checks that pods resolved by the
// owner lookup are grouped into their workloads (with correct replica
// counts) and that the rows come back sorted by cost, highest first.
func TestComputeWorkloads_OwnerLookupResolves(t *testing.T) {
	// Three pods reported by PromQL; ownerLookup resolves all three to two
	// distinct workloads. Replicas should be 2 + 1, not 3 standalone rows.
	// worker pod cost (5.0) > sum of api pods (1.0 + 1.0 = 2.0) so sort
	// is deterministic. The same vector body is returned for all four
	// queries (CPU alloc, mem alloc, CPU usage, mem usage), so the
	// per-pod HourlyCost is 2× the input value (cpu + mem).
	client := workloadsProm(t, podVectorBody(map[string]float64{
		"api-7f8d9c-xyz12":  1.0,
		"api-7f8d9c-abc34":  1.0,
		"worker-deadbeef01": 5.0,
	}))
	lookup := func(pod string) (WorkloadOwner, bool) {
		switch pod {
		case "api-7f8d9c-xyz12", "api-7f8d9c-abc34":
			return WorkloadOwner{Name: "api", Kind: "Deployment"}, true
		case "worker-deadbeef01":
			return WorkloadOwner{Name: "worker", Kind: "Job"}, true
		}
		return WorkloadOwner{}, false
	}
	got := ComputeWorkloadsFromProm(context.Background(), client, "default", lookup)
	if !got.Available {
		t.Fatalf("expected Available=true, got %+v", got)
	}
	if len(got.Workloads) != 2 {
		t.Fatalf("expected 2 workloads, got %d: %+v", len(got.Workloads), got.Workloads)
	}
	// Workloads are sorted descending by HourlyCost. worker totals 10.0
	// (5.0 CPU + 5.0 mem, since every query returns the same body) and api
	// totals 4.0 (two pods × 2.0), so worker comes first.
	if got.Workloads[0].Name != "worker" || got.Workloads[0].Kind != "Job" {
		t.Errorf("first workload: got %s/%s, want worker/Job", got.Workloads[0].Name, got.Workloads[0].Kind)
	}
	if got.Workloads[0].Replicas != 1 {
		t.Errorf("worker replicas: got %d, want 1", got.Workloads[0].Replicas)
	}
	if got.Workloads[1].Name != "api" || got.Workloads[1].Kind != "Deployment" {
		t.Errorf("second workload: got %s/%s, want api/Deployment", got.Workloads[1].Name, got.Workloads[1].Kind)
	}
	if got.Workloads[1].Replicas != 2 {
		t.Errorf("api replicas: got %d, want 2", got.Workloads[1].Replicas)
	}
}
+ client := workloadsProm(t, podVectorBody(map[string]float64{ + "api-7f8d9c-xyz12": 1.0, + })) + got := ComputeWorkloadsFromProm(context.Background(), client, "default", nil) + if !got.Available { + t.Fatalf("expected Available=true, got %+v", got) + } + if len(got.Workloads) != 1 { + t.Fatalf("expected 1 workload, got %d", len(got.Workloads)) + } + if got.Workloads[0].Name != "api" || got.Workloads[0].Kind != "standalone" { + t.Errorf("got %s/%s, want api/standalone", got.Workloads[0].Name, got.Workloads[0].Kind) + } +} + +func TestComputeWorkloads_OwnerLookupUnresolvedPodFallsBack(t *testing.T) { + // Lookup resolves one pod, returns false for the other — false case must + // still produce a row (with the stripPodSuffix-derived name) rather than + // silently dropping the pod. + client := workloadsProm(t, podVectorBody(map[string]float64{ + "api-7f8d9c-xyz12": 1.0, + "orphan-pod-abc-123": 1.0, + })) + lookup := func(pod string) (WorkloadOwner, bool) { + if pod == "api-7f8d9c-xyz12" { + return WorkloadOwner{Name: "api", Kind: "Deployment"}, true + } + return WorkloadOwner{}, false + } + got := ComputeWorkloadsFromProm(context.Background(), client, "default", lookup) + if !got.Available { + t.Fatalf("expected Available=true, got %+v", got) + } + if len(got.Workloads) != 2 { + t.Fatalf("expected 2 workloads, got %d: %+v", len(got.Workloads), got.Workloads) + } + // Find the orphan — should have kind="standalone" and stripped name. 
+ var orphan *WorkloadCost + for i := range got.Workloads { + if got.Workloads[i].Kind == "standalone" { + orphan = &got.Workloads[i] + break + } + } + if orphan == nil { + t.Fatalf("no standalone workload found in %+v", got.Workloads) + } + if orphan.Name != "orphan-pod" { + // stripPodSuffix strips two trailing -suffixes: orphan-pod-abc-123 → orphan-pod + t.Errorf("orphan name: got %q, want %q", orphan.Name, "orphan-pod") + } +} + +func TestComputeWorkloads_EmptyResultReturnsNoMetricsReason(t *testing.T) { + // Queries succeed but return zero series — should surface ReasonNoMetrics + // (not Available=true with empty workloads list). + emptyBody := `{"status":"success","data":{"resultType":"vector","result":[]}}` + client := workloadsProm(t, emptyBody) + got := ComputeWorkloadsFromProm(context.Background(), client, "default", nil) + if got.Available { + t.Errorf("expected Available=false on empty results, got Available=true") + } + if got.Reason != ReasonNoMetrics { + t.Errorf("Reason: got %q, want %q", got.Reason, ReasonNoMetrics) + } +} + +func TestComputeWorkloads_NilClient(t *testing.T) { + got := ComputeWorkloadsFromProm(context.Background(), nil, "default", nil) + if got.Available { + t.Errorf("expected Available=false with nil client") + } + if got.Reason != ReasonNoPrometheus { + t.Errorf("Reason: got %q, want %q", got.Reason, ReasonNoPrometheus) + } +} + +func TestStripPodSuffix(t *testing.T) { + cases := []struct { + in, want string + }{ + {"myapp-7f8d9c-xyz12", "myapp"}, // deployment pod (rs-hash + pod-hash) + {"myapp-xyz12", "myapp"}, // single suffix (e.g. 
CronJob) + {"mywf-step-1-abc12-xyz", "mywf-step-1"}, // multi-segment workflow name + {"plain", "plain"}, // no dashes + {"-leading", "-leading"}, // leading-dash edge case + } + for _, tc := range cases { + got := stripPodSuffix(tc.in) + if got != tc.want { + t.Errorf("stripPodSuffix(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} diff --git a/pkg/prom/client.go b/pkg/prom/client.go new file mode 100644 index 000000000..2ae91a06d --- /dev/null +++ b/pkg/prom/client.go @@ -0,0 +1,172 @@ +package prom + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/url" + "strconv" + "time" +) + +// Client is a Prometheus HTTP API client that delegates all network calls to +// the injected Transport. The Client itself is stateless with respect to +// discovery — callers are responsible for constructing an appropriate +// Transport (direct HTTP, kubectl port-forward, or any other tunnel). +type Client struct { + t Transport +} + +// NewClient wraps the given Transport. +func NewClient(t Transport) *Client { + return &Client{t: t} +} + +// Query executes an instant PromQL query. +func (c *Client) Query(ctx context.Context, promQL string) (*QueryResult, error) { + return c.issueQuery(ctx, "/api/v1/query", url.Values{"query": {promQL}}) +} + +// QueryRange executes a PromQL range query. 
+func (c *Client) QueryRange(ctx context.Context, promQL string, start, end time.Time, step time.Duration) (*QueryResult, error) { + params := url.Values{ + "query": {promQL}, + "start": {strconv.FormatInt(start.Unix(), 10)}, + "end": {strconv.FormatInt(end.Unix(), 10)}, + "step": {fmt.Sprintf("%.0f", step.Seconds())}, + } + return c.issueQuery(ctx, "/api/v1/query_range", params) +} + +func (c *Client) issueQuery(ctx context.Context, path string, params url.Values) (*QueryResult, error) { + body, err := c.t.Do(ctx, "GET", path, params) + if err != nil { + return nil, err + } + + var pr promResponse + if err := json.Unmarshal(body, &pr); err != nil { + return nil, fmt.Errorf("prom: parse response from %s: %w", c.t.Address(), err) + } + if pr.Status != "success" { + return nil, fmt.Errorf("prom: query error from %s: %s (%s)", c.t.Address(), pr.Error, pr.ErrorType) + } + return parseQueryResult(pr.Data) +} + +// ProbeReason explains a Probe result. An empty string on true = ok. +// On false, Reason indicates why discovery should skip this candidate. +type ProbeReason string + +const ( + ProbeReasonTransportError ProbeReason = "transport_error" // network/HTTP failure + ProbeReasonAuthError ProbeReason = "auth_error" // HTTP 401/403 — credentials rejected + ProbeReasonNotPrometheus ProbeReason = "not_prometheus" // 200 but response body isn't prom JSON (captive portal, login page) + ProbeReasonPromError ProbeReason = "prom_error" // prom responded with status=error + ProbeReasonEmptyInstance ProbeReason = "empty_instance" // prom responded success but zero "up" results +) + +// Probe checks if a Prometheus endpoint is reachable and has at least one +// active scrape target. Returns (ok, reason). When ok is true the reason is +// empty; when ok is false the reason indicates why (callers may use this +// for targeted logging — e.g., warn once per empty-instance discovery +// skip). +// +// Uses a 3-second timeout regardless of the context deadline to fail fast. 
+func (c *Client) Probe(ctx context.Context) (bool, ProbeReason) { + probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + + body, err := c.t.Do(probeCtx, "GET", "/api/v1/query", url.Values{"query": {"up"}}) + if err != nil { + var httpErr *HTTPError + if errors.As(err, &httpErr) && (httpErr.StatusCode == 401 || httpErr.StatusCode == 403) { + return false, ProbeReasonAuthError + } + return false, ProbeReasonTransportError + } + + var pr struct { + Status string `json:"status"` + Data struct { + Result []json.RawMessage `json:"result"` + } `json:"data"` + } + if err := json.Unmarshal(body, &pr); err != nil { + return false, ProbeReasonNotPrometheus + } + if pr.Status != "success" { + return false, ProbeReasonPromError + } + if len(pr.Data.Result) == 0 { + return false, ProbeReasonEmptyInstance + } + return true, "" +} + +func parseQueryResult(data json.RawMessage) (*QueryResult, error) { + var raw struct { + ResultType string `json:"resultType"` + Result []struct { + Metric map[string]string `json:"metric"` + Values [][]interface{} `json:"values"` // for matrix + Value []interface{} `json:"value"` // for vector + } `json:"result"` + } + + if err := json.Unmarshal(data, &raw); err != nil { + return nil, fmt.Errorf("prom: parse result: %w", err) + } + + result := &QueryResult{ + ResultType: raw.ResultType, + Series: make([]Series, 0, len(raw.Result)), + } + + for _, r := range raw.Result { + series := Series{Labels: r.Metric} + + switch raw.ResultType { + case "matrix": + series.DataPoints = make([]DataPoint, 0, len(r.Values)) + for _, v := range r.Values { + if dp, ok := parseDataPoint(v); ok { + series.DataPoints = append(series.DataPoints, dp) + } + } + case "vector": + if r.Value != nil { + if dp, ok := parseDataPoint(r.Value); ok { + series.DataPoints = []DataPoint{dp} + } + } + } + + result.Series = append(result.Series, series) + } + + return result, nil +} + +func parseDataPoint(v []interface{}) (DataPoint, bool) { + if len(v) != 2 { + 
return DataPoint{}, false + } + + ts, ok := v[0].(float64) + if !ok { + return DataPoint{}, false + } + + valStr, sok := v[1].(string) + if !sok { + return DataPoint{}, false + } + val, err := strconv.ParseFloat(valStr, 64) + if err != nil { + return DataPoint{}, false + } + + return DataPoint{Timestamp: int64(ts), Value: val}, true +} diff --git a/pkg/prom/client_test.go b/pkg/prom/client_test.go new file mode 100644 index 000000000..dd8333ca0 --- /dev/null +++ b/pkg/prom/client_test.go @@ -0,0 +1,177 @@ +package prom + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +// fakeProm returns an HTTPTransport pointed at a test server with a scripted +// response for /api/v1/query and /api/v1/query_range. +func fakeProm(t *testing.T, handler http.HandlerFunc) *HTTPTransport { + t.Helper() + srv := httptest.NewServer(handler) + t.Cleanup(srv.Close) + return NewHTTPTransport(srv.URL, "", nil) +} + +func TestClient_Query_ParsesVector(t *testing.T) { + body := `{ + "status":"success", + "data":{ + "resultType":"vector", + "result":[ + {"metric":{"namespace":"checkout"},"value":[1700000000, "42.5"]} + ] + } + }` + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + if !strings.HasSuffix(r.URL.Path, "/api/v1/query") { + t.Errorf("unexpected path %q", r.URL.Path) + } + if got := r.URL.Query().Get("query"); got != "up" { + t.Errorf("query param = %q, want up", got) + } + _, _ = w.Write([]byte(body)) + }) + + c := NewClient(tr) + res, err := c.Query(context.Background(), "up") + if err != nil { + t.Fatalf("Query: %v", err) + } + if res.ResultType != "vector" || len(res.Series) != 1 { + t.Fatalf("bad result: %+v", res) + } + s := res.Series[0] + if s.Labels["namespace"] != "checkout" { + t.Errorf("label: %v", s.Labels) + } + if len(s.DataPoints) != 1 || s.DataPoints[0].Timestamp != 1700000000 || s.DataPoints[0].Value != 42.5 { + t.Errorf("datapoint: %+v", s.DataPoints) + } +} + +func 
TestClient_QueryRange_ParsesMatrix(t *testing.T) { + body := `{ + "status":"success", + "data":{ + "resultType":"matrix", + "result":[ + {"metric":{"pod":"p1"},"values":[[1700000000,"1"],[1700000060,"2"]]} + ] + } + }` + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + if !strings.HasSuffix(r.URL.Path, "/api/v1/query_range") { + t.Errorf("unexpected path %q", r.URL.Path) + } + if r.URL.Query().Get("step") == "" { + t.Error("step missing") + } + _, _ = w.Write([]byte(body)) + }) + + c := NewClient(tr) + res, err := c.QueryRange(context.Background(), `rate(x[1m])`, + time.Unix(1700000000, 0), time.Unix(1700000060, 0), 30*time.Second) + if err != nil { + t.Fatalf("QueryRange: %v", err) + } + if res.ResultType != "matrix" || len(res.Series[0].DataPoints) != 2 { + t.Fatalf("bad result: %+v", res) + } +} + +func TestClient_Query_PropagatesPromError(t *testing.T) { + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"status":"error","errorType":"bad_data","error":"parse error"}`)) + }) + c := NewClient(tr) + _, err := c.Query(context.Background(), "up") + if err == nil || !strings.Contains(err.Error(), "parse error") { + t.Errorf("expected prom error, got %v", err) + } +} + +func TestClient_Query_HTTPErrorIsTyped(t *testing.T) { + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte("upstream busy")) + }) + c := NewClient(tr) + _, err := c.Query(context.Background(), "up") + if err == nil { + t.Fatal("expected error") + } + var httpErr *HTTPError + if !errors.As(err, &httpErr) { + t.Fatalf("want *HTTPError, got %T: %v", err, err) + } + if httpErr.StatusCode != http.StatusBadGateway { + t.Errorf("status: %d", httpErr.StatusCode) + } +} + +func TestClient_Probe_RejectsEmptyInstance(t *testing.T) { + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + _, _ = 
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + }) + c := NewClient(tr) + ok, reason := c.Probe(context.Background()) + if ok { + t.Error("probe should reject instance with empty up result") + } + if reason != ProbeReasonEmptyInstance { + t.Errorf("reason = %q, want empty_instance", reason) + } +} + +func TestClient_Probe_AcceptsActiveInstance(t *testing.T) { + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1,"1"]}]}}`)) + }) + c := NewClient(tr) + ok, reason := c.Probe(context.Background()) + if !ok { + t.Error("probe should accept active instance") + } + if reason != "" { + t.Errorf("reason should be empty on success, got %q", reason) + } +} + +func TestClient_Probe_RejectsNonPromBody(t *testing.T) { + tr := fakeProm(t, func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`captive portal`)) + }) + c := NewClient(tr) + ok, reason := c.Probe(context.Background()) + if ok { + t.Error("probe should reject non-JSON body") + } + if reason != ProbeReasonNotPrometheus { + t.Errorf("reason = %q, want not_prometheus", reason) + } +} + +func TestHTTPTransport_BasePathIncluded(t *testing.T) { + var capturedPath string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedPath = r.URL.Path + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + })) + defer srv.Close() + + tr := NewHTTPTransport(srv.URL, "/select/0/prometheus", nil) + c := NewClient(tr) + _, _ = c.Query(context.Background(), "up") + if capturedPath != "/select/0/prometheus/api/v1/query" { + t.Errorf("base path not applied: got %q", capturedPath) + } +} + diff --git a/pkg/prom/discovery.go b/pkg/prom/discovery.go new file mode 100644 index 000000000..88f815bf8 --- /dev/null +++ b/pkg/prom/discovery.go @@ -0,0 +1,299 @@ +package prom + +import ( + 
"context" + "fmt" + "sort" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// CandidateSource describes how a candidate was found. +type CandidateSource string + +const ( + CandidateSourceWellKnown CandidateSource = "well_known" + CandidateSourceDynamic CandidateSource = "dynamic" +) + +// Candidate is a Prometheus-compatible service the caller can attempt to +// reach. Discover populates the fields and orders candidates by priority, +// but does not probe — it leaves the transport choice (direct HTTP vs. +// port-forward vs. tunneled proxy) to the caller. +type Candidate struct { + Namespace string + Name string + Port int // service port (for in-cluster addressing) + TargetPort int // container port (for port-forwarding to the pod) + ClusterAddr string // http://{name}.{ns}.svc.cluster.local:{port} + BasePath string // e.g. "/select/0/prometheus" for vmselect + Score int // relative likelihood of being Prometheus + Source CandidateSource // well_known | dynamic +} + +// DiscoverOptions tunes Discover's behavior. +type DiscoverOptions struct { + // IncludeDynamic controls whether a cluster-wide service scan is performed. + // The scan is an O(all services) List call plus a scoring pass; skip it + // for callers that only need a quick well-known check. + IncludeDynamic bool + + // MaxDynamic caps the number of dynamic candidates returned. Default 5. + MaxDynamic int + + // Logger is optional; if set, Discover emits verbose progress messages. + Logger func(format string, args ...interface{}) +} + +// WellKnownLocations is the ordered list of namespaces + service names where +// Prometheus-compatible services are commonly installed. 
+var WellKnownLocations = []struct { + Namespace string + Name string + Port int // 0 = use service's first port + BasePath string // sub-path for Prometheus API +}{ + // VictoriaMetrics — monitoring namespace first (workload metrics) + {"monitoring", "victoria-metrics-victoria-metrics-single-server", 8428, ""}, + {"monitoring", "victoria-metrics-single-server", 8428, ""}, + {"monitoring", "vmsingle", 8428, ""}, + {"monitoring", "vmselect", 8481, "/select/0/prometheus"}, + {"victoria-metrics", "victoria-metrics-victoria-metrics-single-server", 8428, ""}, + {"victoria-metrics", "victoria-metrics-single-server", 8428, ""}, + {"victoria-metrics", "vmsingle", 8428, ""}, + {"victoria-metrics", "vmselect", 8481, "/select/0/prometheus"}, + // kube-prometheus-stack + {"monitoring", "kube-prometheus-stack-prometheus", 9090, ""}, + {"monitoring", "prometheus-kube-prometheus-prometheus", 9090, ""}, + {"monitoring", "prometheus-operated", 9090, ""}, + // Standard Prometheus + {"opencost", "prometheus-server", 0, ""}, + {"monitoring", "prometheus-server", 0, ""}, + {"prometheus", "prometheus-server", 0, ""}, + {"observability", "prometheus-server", 0, ""}, + {"metrics", "prometheus-server", 0, ""}, + {"kube-system", "prometheus", 0, ""}, + {"default", "prometheus", 0, ""}, + // VictoriaMetrics — caretta namespace (traffic-specific, may lack workload metrics) + {"caretta", "caretta-vm", 8428, ""}, +} + +// metricsNamespaces are commonly used for metrics services; used as a scoring +// signal in dynamic discovery. +var metricsNamespaces = map[string]bool{ + "monitoring": true, + "prometheus": true, + "observability": true, + "metrics": true, + "victoria-metrics": true, + "caretta": true, + "opencost": true, +} + +// skipNamespaces are excluded from dynamic discovery. +var skipNamespaces = map[string]bool{ + "kube-public": true, + "kube-node-lease": true, +} + +// Discover enumerates candidate Prometheus-compatible services reachable to +// the given k8sClient. 
Well-known locations are returned first in declared +// priority order, optionally followed by dynamically-discovered services +// ranked by ScoreService. +// +// Discover does NOT probe any candidate — callers decide how to reach each +// (direct HTTP, port-forward, tunneled proxy) and then use +// pkg/prom.Client.Probe to validate. +func Discover(ctx context.Context, k8sClient kubernetes.Interface, opts DiscoverOptions) ([]Candidate, error) { + if k8sClient == nil { + return nil, fmt.Errorf("prom.Discover: k8sClient is nil") + } + if opts.MaxDynamic <= 0 { + opts.MaxDynamic = 5 + } + logf := opts.Logger + if logf == nil { + logf = func(string, ...interface{}) {} + } + + var out []Candidate + + // Layer 1: well-known locations. Preserve declared order for determinism. + for _, loc := range WellKnownLocations { + svc, err := k8sClient.CoreV1().Services(loc.Namespace).Get(ctx, loc.Name, metav1.GetOptions{}) + if err != nil { + if !apierrors.IsNotFound(err) { + logf("prom.Discover: error checking %s/%s: %v", loc.Namespace, loc.Name, err) + } + continue + } + port := resolvePort(*svc, loc.Port) + out = append(out, Candidate{ + Namespace: svc.Namespace, + Name: svc.Name, + Port: port, + TargetPort: resolveTargetPort(*svc, port), + ClusterAddr: buildClusterAddr(svc.Name, svc.Namespace, svc.Spec.ClusterIP, port), + BasePath: loc.BasePath, + Source: CandidateSourceWellKnown, + }) + } + + if !opts.IncludeDynamic { + return out, nil + } + + // Layer 2: dynamic cluster-wide scan, scored + sorted. 
+ svcs, err := k8sClient.CoreV1().Services("").List(ctx, metav1.ListOptions{}) + if err != nil { + logf("prom.Discover: failed to list services: %v", err) + return out, nil // well-known results still useful + } + + var scored []Candidate + for _, svc := range svcs.Items { + score, bp := ScoreService(svc) + if score <= 0 { + continue + } + port := resolvePort(svc, 0) + scored = append(scored, Candidate{ + Namespace: svc.Namespace, + Name: svc.Name, + Port: port, + TargetPort: resolveTargetPort(svc, port), + ClusterAddr: buildClusterAddr(svc.Name, svc.Namespace, svc.Spec.ClusterIP, port), + BasePath: bp, + Score: score, + Source: CandidateSourceDynamic, + }) + } + + sort.Slice(scored, func(i, j int) bool { + return scored[i].Score > scored[j].Score + }) + + if len(scored) > opts.MaxDynamic { + scored = scored[:opts.MaxDynamic] + } + return append(out, scored...), nil +} + +// ScoreService computes a heuristic score for a service being +// Prometheus-compatible. Returns the score and an inferred BasePath for +// vmselect-style services. 
+func ScoreService(svc corev1.Service) (score int, basePath string) { + if svc.Spec.Type == corev1.ServiceTypeExternalName { + return 0, "" + } + if skipNamespaces[svc.Namespace] { + return 0, "" + } + + labels := svc.Labels + appName := labels["app.kubernetes.io/name"] + appLabel := labels["app"] + component := labels["app.kubernetes.io/component"] + + switch appName { + case "prometheus": + score += 100 + case "victoria-metrics-single", "vmsingle": + score += 100 + case "vmselect": + score += 90 + basePath = "/select/0/prometheus" + case "thanos-query", "thanos-querier": + score += 80 + } + + switch appLabel { + case "prometheus", "prometheus-server": + score += 80 + case "vmsingle": + score += 80 + case "vmselect": + score += 80 + basePath = "/select/0/prometheus" + } + + if score > 0 && component == "server" { + score += 20 + } + + for _, p := range svc.Spec.Ports { + switch p.Port { + case 9090: // Prometheus default + score += 30 + case 8428: // VictoriaMetrics single-node default + score += 30 + case 8481: // VictoriaMetrics vmselect default + score += 25 + case 9009: // Thanos Query default + score += 25 + } + if strings.Contains(strings.ToLower(p.Name), "prometheus") { + score += 10 + } + } + + nameLower := strings.ToLower(svc.Name) + if strings.Contains(nameLower, "prometheus") { + score += 20 + } + if strings.Contains(nameLower, "victoria") || strings.Contains(nameLower, "vmsingle") || strings.Contains(nameLower, "vmselect") { + score += 20 + if strings.Contains(nameLower, "vmselect") && basePath == "" { + basePath = "/select/0/prometheus" + } + } + if strings.Contains(nameLower, "thanos") { + score += 15 + } + + if metricsNamespaces[svc.Namespace] { + score += 10 + } + + return score, basePath +} + +func resolvePort(svc corev1.Service, defaultPort int) int { + if defaultPort != 0 { + return defaultPort + } + if len(svc.Spec.Ports) > 0 { + return int(svc.Spec.Ports[0].Port) + } + return 80 +} + +// resolveTargetPort returns the container port, for 
port-forwarding which +// bypasses the Service. When the service port differs from the container's +// targetPort (e.g., service:80 → container:9090), port-forward needs the +// container port. +func resolveTargetPort(svc corev1.Service, servicePort int) int { + for _, p := range svc.Spec.Ports { + if int(p.Port) == servicePort { + if p.TargetPort.IntVal > 0 { + return int(p.TargetPort.IntVal) + } + return servicePort + } + } + return servicePort +} + +// buildClusterAddr returns the in-cluster HTTP URL for a service. Headless +// services (ClusterIP=None) use a pod-0 hostname; this is best-effort and +// really meant for stateful Prometheus deployments with predictable names. +func buildClusterAddr(name, namespace, clusterIP string, port int) string { + if clusterIP == "None" { + return fmt.Sprintf("http://%s-0.%s.%s.svc.cluster.local:%d", name, name, namespace, port) + } + return fmt.Sprintf("http://%s.%s.svc.cluster.local:%d", name, namespace, port) +} diff --git a/pkg/prom/discovery_test.go b/pkg/prom/discovery_test.go new file mode 100644 index 000000000..6d3b85743 --- /dev/null +++ b/pkg/prom/discovery_test.go @@ -0,0 +1,242 @@ +package prom + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes/fake" +) + +func TestScoreService_TableDriven(t *testing.T) { + tests := []struct { + name string + svc corev1.Service + wantMin int + wantMax int + wantBasePath string + }{ + { + name: "plain prometheus by app.kubernetes.io/name + port", + svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "prometheus-server", + Namespace: "monitoring", + Labels: map[string]string{"app.kubernetes.io/name": "prometheus"}, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{{Port: 9090}}, + }, + }, + wantMin: 100 + 30 + 20 + 10, // name + port + name-contains + metrics ns + wantMax: 500, + }, + { + name: "vmselect sets basePath", + 
svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vmselect", + Namespace: "monitoring", + Labels: map[string]string{"app.kubernetes.io/name": "vmselect"}, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{{Port: 8481}}, + }, + }, + wantMin: 90 + 25 + 20 + 10, + wantMax: 200, + wantBasePath: "/select/0/prometheus", + }, + { + name: "thanos-query scores lower than prometheus but non-zero", + svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "thanos-query", + Namespace: "observability", + Labels: map[string]string{"app.kubernetes.io/name": "thanos-query"}, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{{Port: 9009}}, + }, + }, + wantMin: 80 + 25 + 15 + 10, + wantMax: 200, + }, + { + name: "unrelated service scores zero", + svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "redis", + Namespace: "default", + Labels: map[string]string{"app": "redis"}, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{{Port: 6379}}, + }, + }, + wantMax: 0, + }, + { + name: "ExternalName excluded", + svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "prometheus", Namespace: "monitoring"}, + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeExternalName}, + }, + wantMax: 0, + }, + { + name: "skip-namespace excluded", + svc: corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "prometheus", + Namespace: "kube-public", + Labels: map[string]string{"app.kubernetes.io/name": "prometheus"}, + }, + }, + wantMax: 0, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + score, bp := ScoreService(tc.svc) + if score < tc.wantMin || (tc.wantMax > 0 && score > tc.wantMax) { + t.Errorf("score=%d, want in [%d, %d]", score, tc.wantMin, tc.wantMax) + } + if tc.wantMax == 0 && score != 0 { + t.Errorf("score=%d, want 0", score) + } + if tc.wantBasePath != "" && bp != tc.wantBasePath { + t.Errorf("basePath=%q, want %q", bp, tc.wantBasePath) + } + }) + } +} + +func TestDiscover_WellKnownFirst(t 
*testing.T) { + // Install a standard prometheus-server at a well-known location + // plus an unrelated redis service. + wellKnown := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "prometheus-server", Namespace: "monitoring"}, + Spec: corev1.ServiceSpec{ + ClusterIP: "10.0.0.1", + Ports: []corev1.ServicePort{{Port: 80, TargetPort: intstr.FromInt(9090)}}, + }, + } + redis := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "redis", Namespace: "default"}, + Spec: corev1.ServiceSpec{ + ClusterIP: "10.0.0.2", + Ports: []corev1.ServicePort{{Port: 6379}}, + }, + } + // Install an additional unknown-but-scoring dynamic candidate. + thanos := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "thanos-query", + Namespace: "observability", + Labels: map[string]string{"app.kubernetes.io/name": "thanos-query"}, + }, + Spec: corev1.ServiceSpec{ + ClusterIP: "10.0.0.3", + Ports: []corev1.ServicePort{{Port: 9009}}, + }, + } + + k8s := fake.NewSimpleClientset(wellKnown, redis, thanos) + cands, err := Discover(context.Background(), k8s, DiscoverOptions{IncludeDynamic: true, MaxDynamic: 3}) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(cands) < 2 { + t.Fatalf("want at least 2 candidates, got %d", len(cands)) + } + + // First must be the well-known match. + if cands[0].Source != CandidateSourceWellKnown { + t.Errorf("cands[0].Source = %q, want well_known", cands[0].Source) + } + if cands[0].Namespace != "monitoring" || cands[0].Name != "prometheus-server" { + t.Errorf("cands[0] = %s/%s, want monitoring/prometheus-server", cands[0].Namespace, cands[0].Name) + } + if cands[0].ClusterAddr != "http://prometheus-server.monitoring.svc.cluster.local:80" { + t.Errorf("cluster addr = %q", cands[0].ClusterAddr) + } + if cands[0].TargetPort != 9090 { + t.Errorf("TargetPort = %d, want 9090", cands[0].TargetPort) + } + + // Dynamic thanos match should be present. 
+ var sawDynamicThanos bool + for _, c := range cands { + if c.Source == CandidateSourceDynamic && c.Name == "thanos-query" { + sawDynamicThanos = true + break + } + } + if !sawDynamicThanos { + t.Errorf("expected dynamic thanos candidate; got %+v", cands) + } + + // Redis must not appear in any form. + for _, c := range cands { + if c.Name == "redis" { + t.Errorf("redis should not be a candidate: %+v", c) + } + } +} + +func TestDiscover_SkipsDynamicWhenDisabled(t *testing.T) { + // Only a dynamic-scoring service is present (no well-known match). + prom := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-prometheus", + Namespace: "observability", + Labels: map[string]string{"app.kubernetes.io/name": "prometheus"}, + }, + Spec: corev1.ServiceSpec{ + ClusterIP: "10.0.0.5", + Ports: []corev1.ServicePort{{Port: 9090}}, + }, + } + + k8s := fake.NewSimpleClientset(prom) + cands, err := Discover(context.Background(), k8s, DiscoverOptions{IncludeDynamic: false}) + if err != nil { + t.Fatal(err) + } + if len(cands) != 0 { + t.Errorf("expected no candidates when dynamic is disabled and no well-known match; got %d", len(cands)) + } +} + +func TestDiscover_HeadlessServiceProducesPod0Addr(t *testing.T) { + headless := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "prometheus-server", Namespace: "monitoring"}, + Spec: corev1.ServiceSpec{ + ClusterIP: "None", + Ports: []corev1.ServicePort{{Port: 9090}}, + }, + } + k8s := fake.NewSimpleClientset(headless) + cands, err := Discover(context.Background(), k8s, DiscoverOptions{}) + if err != nil { + t.Fatal(err) + } + if len(cands) != 1 { + t.Fatalf("want 1 candidate, got %d", len(cands)) + } + want := "http://prometheus-server-0.prometheus-server.monitoring.svc.cluster.local:9090" + if cands[0].ClusterAddr != want { + t.Errorf("cluster addr = %q, want %q", cands[0].ClusterAddr, want) + } +} + +func TestDiscover_NilClient(t *testing.T) { + _, err := Discover(context.Background(), nil, DiscoverOptions{}) + if err 
== nil { + t.Error("expected error for nil client") + } +} diff --git a/internal/prometheus/queries.go b/pkg/prom/queries.go similarity index 96% rename from internal/prometheus/queries.go rename to pkg/prom/queries.go index c8fecbfc7..75192c558 100644 --- a/internal/prometheus/queries.go +++ b/pkg/prom/queries.go @@ -1,4 +1,4 @@ -package prometheus +package prom import ( "fmt" @@ -16,10 +16,10 @@ func SanitizeLabelValue(s string) string { }) } -// escapeRegexMeta escapes regex metacharacters for PromQL =~ matching. +// EscapeRegexMeta escapes regex metacharacters for PromQL =~ matching. var regexMeta = regexp.MustCompile(`([.+*?^${}()|[\]\\])`) -func escapeRegexMeta(s string) string { +func EscapeRegexMeta(s string) string { return regexMeta.ReplaceAllString(s, `\\$1`) } @@ -199,9 +199,9 @@ func buildClusterQueryInner(category MetricCategory, filterContainer bool) strin } } -// categoryUsesContainerFilter returns true if the category's queries include -// the container!='' filter that may need fallback on cri-docker clusters. -func categoryUsesContainerFilter(category MetricCategory) bool { +// CategoryUsesContainerFilter returns true if the category's queries include +// the container!='' filter that may need a fallback on cri-docker clusters. +func CategoryUsesContainerFilter(category MetricCategory) bool { return category == CategoryCPU || category == CategoryMemory } @@ -249,7 +249,7 @@ func buildPodQuery(namespace, podName string, category MetricCategory, filterCon func buildWorkloadQuery(namespace, workloadName string, category MetricCategory, filterContainer bool) string { ns := SanitizeLabelValue(namespace) // Sanitize then escape regex metacharacters so e.g. 
"my.app" matches literally - podPattern := fmt.Sprintf("%s-.*", escapeRegexMeta(SanitizeLabelValue(workloadName))) + podPattern := fmt.Sprintf("%s-.*", EscapeRegexMeta(SanitizeLabelValue(workloadName))) cf := "" if filterContainer { cf = "container!=''," @@ -290,7 +290,7 @@ func buildNodeQuery(nodeName string, category MetricCategory) string { // name or IP. The value often includes a port suffix, so we match with an optional port. // This heuristic works for most standard deployments; clusters with custom relabeling // may need the --prometheus-url flag plus adjusted recording rules. - sanitized := escapeRegexMeta(SanitizeLabelValue(nodeName)) + sanitized := EscapeRegexMeta(SanitizeLabelValue(nodeName)) nodeFilter := fmt.Sprintf(`instance=~'%s(:\\d+)?'`, sanitized) switch category { diff --git a/pkg/prom/transport.go b/pkg/prom/transport.go new file mode 100644 index 000000000..d1dfbad53 --- /dev/null +++ b/pkg/prom/transport.go @@ -0,0 +1,115 @@ +package prom + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" +) + +// Transport is the pluggable HTTP transport used by Client to issue requests +// to a Prometheus HTTP API. Implementations decide how the request physically +// reaches Prometheus — typically either direct HTTP against a known URL +// (in-cluster, or a kubectl port-forwarded localhost) or a tunneled proxy +// transport that forwards requests through some external broker to an +// in-cluster Prometheus. +// +// Transport is responsible for returning the raw upstream body bytes. Parsing +// is the Client's concern. +type Transport interface { + Do(ctx context.Context, method, path string, params url.Values) ([]byte, error) + + // Address returns a human-readable identifier for this transport, used + // for status reporting and error messages — typically the base URL, or + // a short description of the proxy path for tunneled transports. + Address() string +} + +// HTTPTransport is a direct-HTTP Transport. 
// HTTPTransport is a direct-HTTP Transport. It targets BaseURL + BasePath +
// the request path, and uses HTTPClient to send the request.
//
// BasePath is an optional prefix applied before Prometheus API paths and is
// useful for vmselect-style deployments where the API lives under e.g.
// "/select/0/prometheus".
//
// Headers, if non-empty, are applied to every request after the default
// Accept header, so callers may override Accept by setting it here. Typical
// uses are Authorization: Bearer ... and tenant headers like X-Scope-OrgID.
//
// A nil HTTPClient is tolerated: Do then falls back to a client with the same
// 10-second timeout NewHTTPTransport uses, so a struct-literal HTTPTransport
// is usable without panicking.
type HTTPTransport struct {
	BaseURL    string
	BasePath   string
	HTTPClient *http.Client
	Headers    map[string]string
}

// NewHTTPTransport constructs an HTTPTransport with a default 10-second
// timeout if no client is provided.
//
// Both URL parts are normalized so that BaseURL + BasePath + "/api/..." always
// concatenates cleanly: trailing slashes are trimmed from baseURL and
// basePath, and a non-empty basePath gets a leading slash if it lacks one.
func NewHTTPTransport(baseURL, basePath string, httpClient *http.Client) *HTTPTransport {
	if httpClient == nil {
		httpClient = &http.Client{Timeout: 10 * time.Second}
	}
	basePath = strings.TrimRight(basePath, "/")
	if basePath != "" && !strings.HasPrefix(basePath, "/") {
		basePath = "/" + basePath
	}
	return &HTTPTransport{
		BaseURL:    strings.TrimRight(baseURL, "/"),
		BasePath:   basePath,
		HTTPClient: httpClient,
	}
}

// Do issues a request and returns the response body bytes. Non-2xx status
// codes yield a *HTTPError; callers can use errors.As to extract the
// status code and upstream body (Probe distinguishes 401/403 from other
// transport errors this way, for example).
//
// Bodies are capped at 10 MiB. A successful response larger than the cap is
// reported as an error rather than returned truncated, because a cut-off JSON
// document would only fail later with a misleading parse error. For non-2xx
// responses the (possibly truncated) body is still attached to the HTTPError.
func (t *HTTPTransport) Do(ctx context.Context, method, path string, params url.Values) ([]byte, error) {
	const maxBodyBytes = 10 << 20 // 10 MiB cap

	full := t.BaseURL + t.BasePath + path
	if len(params) > 0 {
		// BasePath or path may already carry a query string.
		sep := "?"
		if strings.Contains(full, "?") {
			sep = "&"
		}
		full += sep + params.Encode()
	}

	req, err := http.NewRequestWithContext(ctx, method, full, nil)
	if err != nil {
		return nil, fmt.Errorf("prom.HTTPTransport: build request: %w", err)
	}
	// Default first, then caller headers, so Headers may override Accept.
	req.Header.Set("Accept", "application/json")
	for k, v := range t.Headers {
		req.Header.Set(k, v)
	}

	client := t.HTTPClient
	if client == nil {
		// Zero-value / struct-literal transport: never send without a timeout.
		client = &http.Client{Timeout: 10 * time.Second}
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("prom.HTTPTransport: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	// Read one byte past the cap so an oversized body can be detected.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes+1))
	if err != nil {
		return nil, fmt.Errorf("prom.HTTPTransport: read body: %w", err)
	}
	oversized := len(body) > maxBodyBytes
	if oversized {
		body = body[:maxBodyBytes]
	}

	// Any 2xx is success, matching the documented contract of HTTPError.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, &HTTPError{StatusCode: resp.StatusCode, URL: full, Body: body}
	}
	if oversized {
		return nil, fmt.Errorf("prom.HTTPTransport: response from %s exceeds %d bytes", full, maxBodyBytes)
	}
	return body, nil
}

// Address returns the effective base URL for diagnostics.
func (t *HTTPTransport) Address() string {
	return t.BaseURL + t.BasePath
}

// HTTPError is returned when Prometheus responds with a non-2xx status.
// Body holds the upstream response body as read by the transport.
type HTTPError struct {
	StatusCode int
	URL        string
	Body       []byte
}

// Error formats the status code, URL and upstream body. Only the first 1 KiB
// of the body is rendered so a large error page cannot flood logs; the full
// body stays available in the Body field.
func (e *HTTPError) Error() string {
	const maxShown = 1024

	shown, suffix := e.Body, ""
	if len(shown) > maxShown {
		shown, suffix = shown[:maxShown], "... (truncated)"
	}
	return fmt.Sprintf("prometheus returned %d for %s: %s%s", e.StatusCode, e.URL, shown, suffix)
}
secret", + "X-Scope-OrgID": "tenant-a", + } + + if _, err := NewClient(tr).Query(context.Background(), "up"); err != nil { + t.Fatalf("Query: %v", err) + } + if gotAuth != "Bearer secret" { + t.Errorf("Authorization = %q, want %q", gotAuth, "Bearer secret") + } + if gotTenant != "tenant-a" { + t.Errorf("X-Scope-OrgID = %q, want %q", gotTenant, "tenant-a") + } + if gotAccept != "application/json" { + t.Errorf("Accept = %q, want application/json", gotAccept) + } +} + +func TestHTTPTransport_HeadersOverrideAccept(t *testing.T) { + var gotAccept string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAccept = r.Header.Get("Accept") + _, _ = w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + })) + t.Cleanup(srv.Close) + + tr := NewHTTPTransport(srv.URL, "", nil) + tr.Headers = map[string]string{"Accept": "application/vnd.custom+json"} + + if _, err := NewClient(tr).Query(context.Background(), "up"); err != nil { + t.Fatalf("Query: %v", err) + } + if gotAccept != "application/vnd.custom+json" { + t.Errorf("Accept = %q, want override", gotAccept) + } +} diff --git a/pkg/prom/types.go b/pkg/prom/types.go new file mode 100644 index 000000000..f3ab8e1d3 --- /dev/null +++ b/pkg/prom/types.go @@ -0,0 +1,59 @@ +// Package prom provides a Prometheus HTTP API client with a pluggable +// Transport so the same query, parsing, and discovery logic can be used +// from any context that can reach a Prometheus endpoint — directly, via +// kubectl port-forward, or through a tunneled proxy. +// +// The package is intentionally pure: no global state, no singletons, no +// k8s client dependency in the Client itself. K8s-aware discovery is a +// separate step that constructs a Transport. +package prom + +import "encoding/json" + +// ServiceInfo describes a Prometheus-compatible service discovered in the +// cluster. Used by discovery helpers and returned in Status. 
// Wire and result types of the prom package, declared as one group. Field
// order and JSON tags define the serialized form and must not be reordered.
type (
	// ServiceInfo describes a Prometheus-compatible service discovered in
	// the cluster. Used by discovery helpers and returned in Status.
	ServiceInfo struct {
		Namespace string `json:"namespace"`
		Name      string `json:"name"`
		Port      int    `json:"port"`
		// BasePath is an optional API prefix, e.g. "/select/0/prometheus"
		// for vmselect; omitted from JSON when empty.
		BasePath string `json:"basePath,omitempty"`
	}

	// Status represents the current Prometheus connection status as exposed
	// to callers/UI. Address is the effective URL (may be port-forwarded, a
	// tunneled proxy URL, or a direct service URL depending on the
	// Transport). Optional fields are omitted from JSON when unset.
	Status struct {
		Available   bool         `json:"available"`
		Connected   bool         `json:"connected"`
		Address     string       `json:"address,omitempty"`
		Service     *ServiceInfo `json:"service,omitempty"`
		ContextName string       `json:"contextName,omitempty"`
		Error       string       `json:"error,omitempty"`
	}

	// QueryResult is the parsed result of a Prometheus query.
	QueryResult struct {
		ResultType string   `json:"resultType"`
		Series     []Series `json:"series"`
	}

	// Series is a single time series from a Prometheus query: its label
	// set plus the data points returned for it.
	Series struct {
		Labels     map[string]string `json:"labels"`
		DataPoints []DataPoint       `json:"dataPoints"`
	}

	// DataPoint is a single (timestamp, value) pair.
	DataPoint struct {
		Timestamp int64   `json:"timestamp"`
		Value     float64 `json:"value"`
	}

	// promResponse is the raw shape returned by the Prometheus HTTP API
	// /api/v1/query and /api/v1/query_range endpoints. Data is kept as raw
	// JSON so decoding of the payload can be deferred.
	promResponse struct {
		Status    string          `json:"status"`
		Data      json.RawMessage `json:"data"`
		ErrorType string          `json:"errorType,omitempty"`
		Error     string          `json:"error,omitempty"`
	}
)
-
+
{/* Header */}