Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,10 +324,6 @@ func main() {
}
nova.NewAPI(filterWeigherController).Init(mux)

// Initialize commitments API for LIQUID interface
commitmentsAPI := commitments.NewAPI(multiclusterClient)
commitmentsAPI.Init(mux, metrics.Registry)

// Detector pipeline controller setup.
novaClient := nova.NewNovaClient()
novaClientConfig := conf.GetConfigOrDie[nova.NovaClientConfig]()
Expand All @@ -337,6 +333,12 @@ func main() {
setupLog.Error(err, "unable to initialize nova client")
os.Exit(1)
}

// Initialize commitments API for LIQUID interface (with Nova client for usage reporting)
commitmentsConfig := conf.GetConfigOrDie[commitments.Config]()
commitmentsAPI := commitments.NewAPIWithConfig(multiclusterClient, commitmentsConfig, novaClient)
commitmentsAPI.Init(mux, metrics.Registry)
Comment on lines +337 to +340
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't gate the commitments HTTP API behind nova-pipeline-controllers.

This is the only commitmentsAPI.Init() in cmd/main.go, so deployments that do not enable that controller will stop serving /v1/commitments/... entirely. It also makes the Nova pipeline controller path hard-require commitments.Config via GetConfigOrDie(), even though the usage path already tolerates a nil Nova client.

Suggested direction
+var commitmentsNovaClient commitments.UsageNovaClient
+
 if slices.Contains(mainConfig.EnabledControllers, "nova-pipeline-controllers") {
   ...
-  commitmentsConfig := conf.GetConfigOrDie[commitments.Config]()
-  commitmentsAPI := commitments.NewAPIWithConfig(multiclusterClient, commitmentsConfig, novaClient)
-  commitmentsAPI.Init(mux, metrics.Registry)
+  commitmentsNovaClient = novaClient
   ...
 }
+
+commitmentsConfig := conf.GetConfigOrDie[commitments.Config]()
+commitmentsAPI := commitments.NewAPIWithConfig(multiclusterClient, commitmentsConfig, commitmentsNovaClient)
+commitmentsAPI.Init(mux, metrics.Registry)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cmd/main.go` around lines 336 - 339, The commitments HTTP API is currently
only initialized when the nova pipeline controller is enabled which removes
/v1/commitments when that controller is disabled and forces a required
commitments.Config via GetConfigOrDie; change this so
commitments.NewAPIWithConfig and commitmentsAPI.Init(mux, metrics.Registry) are
always called regardless of the nova-pipeline-controllers flag, obtain
commitments.Config in a non-fatal way (e.g., use a non-Die getter or nil/default
config instead of GetConfigOrDie) and pass nil for novaClient when the Nova
pipeline controller is not enabled so the API remains served while Nova
integration stays optional (adjust code around commitmentsConfig,
commitments.NewAPIWithConfig, and commitmentsAPI.Init).


deschedulingsController := &nova.DetectorPipelineController{
Monitor: detectorPipelineMonitor,
Breaker: &nova.DetectorCycleBreaker{NovaClient: novaClient},
Expand Down
97 changes: 97 additions & 0 deletions helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,100 @@ groups:
to be scheduled. Affected commitment changes are rolled back and Limes
will see them as failed. Consider investigating the scheduler performance
or increasing the timeout configuration.

# Committed Resource Usage API Alerts
- alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource usage API HTTP 400 errors too high"
description: >
The committed resource usage API (Limes LIQUID integration) is responding
with HTTP 4xx errors. This may indicate invalid project IDs or malformed
requests from Limes. Limes will typically retry these requests.

- alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource usage API HTTP 500 errors too high"
description: >
The committed resource usage API (Limes LIQUID integration) is responding
with HTTP 5xx errors. This indicates internal problems fetching reservations
or Nova server data. Limes may receive stale or incomplete usage data.

- alert: CortexNovaCommittedResourceUsageLatencyTooHigh
expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource usage API latency too high"
description: >
The committed resource usage API (Limes LIQUID integration) is experiencing
high latency (p95 > 5s). This may indicate slow Nova API responses or
database queries. Limes scrapes may time out, affecting quota reporting.

# Committed Resource Capacity API Alerts
- alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource capacity API HTTP 400 errors too high"
description: >
The committed resource capacity API (Limes LIQUID integration) is responding
with HTTP 4xx errors. This may indicate malformed requests from Limes.

- alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource capacity API HTTP 500 errors too high"
description: >
The committed resource capacity API (Limes LIQUID integration) is responding
with HTTP 5xx errors. This indicates internal problems calculating cluster
capacity. Limes may receive stale or incomplete capacity data.

- alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource capacity API latency too high"
description: >
The committed resource capacity API (Limes LIQUID integration) is experiencing
high latency (p95 > 5s). This may indicate slow database queries or knowledge
CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
4 changes: 4 additions & 0 deletions internal/scheduling/nova/deschedulings_executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ func (m *mockExecutorNovaClient) GetServerMigrations(ctx context.Context, id str
return []migration{}, nil
}

func (m *mockExecutorNovaClient) ListProjectServers(ctx context.Context, projectID string) ([]ServerDetail, error) {
return []ServerDetail{}, nil
}

func TestExecutor_Reconcile(t *testing.T) {
scheme := runtime.NewScheme()
err := v1alpha1.AddToScheme(scheme)
Expand Down
4 changes: 4 additions & 0 deletions internal/scheduling/nova/detector_cycle_breaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ func (m *mockDetectorCycleBreakerNovaClient) GetServerMigrations(ctx context.Con
return []migration{}, nil
}

func (m *mockDetectorCycleBreakerNovaClient) ListProjectServers(ctx context.Context, projectID string) ([]ServerDetail, error) {
return []ServerDetail{}, nil
}

func TestDetectorCycleBreaker_Filter(t *testing.T) {
tests := []struct {
name string
Expand Down
100 changes: 100 additions & 0 deletions internal/scheduling/nova/nova_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,21 @@ type migration struct {
DestCompute string `json:"dest_compute"`
}

// ServerDetail contains extended server information for usage reporting.
type ServerDetail struct {
ID string `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
TenantID string `json:"tenant_id"`
Created string `json:"created"`
AvailabilityZone string `json:"OS-EXT-AZ:availability_zone"`
Hypervisor string `json:"OS-EXT-SRV-ATTR:hypervisor_hostname"`
FlavorName string // Populated from nested flavor.original_name
FlavorRAM uint64 // Populated from nested flavor.ram
FlavorVCPUs uint64 // Populated from nested flavor.vcpus
FlavorDisk uint64 // Populated from nested flavor.disk
}

type NovaClient interface {
// Initialize the Nova API with the Keystone authentication.
Init(ctx context.Context, client client.Client, conf NovaClientConfig) error
Expand All @@ -47,6 +62,8 @@ type NovaClient interface {
LiveMigrate(ctx context.Context, id string) error
// Get migrations for a server by ID.
GetServerMigrations(ctx context.Context, id string) ([]migration, error)
// List all servers for a project with detailed info.
ListProjectServers(ctx context.Context, projectID string) ([]ServerDetail, error)
}

type novaClient struct {
Expand Down Expand Up @@ -160,3 +177,86 @@ func (api *novaClient) GetServerMigrations(ctx context.Context, id string) ([]mi
slog.Info("fetched migrations for vm", "id", id, "count", len(migrations))
return migrations, nil
}

// ListProjectServers retrieves all servers for a project with detailed info.
func (api *novaClient) ListProjectServers(ctx context.Context, projectID string) ([]ServerDetail, error) {
// Build URL with pagination support
initialURL := api.sc.Endpoint + "servers/detail?all_tenants=true&tenant_id=" + projectID
var nextURL = &initialURL
var result []ServerDetail

for nextURL != nil {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, *nextURL, http.NoBody)
if err != nil {
return nil, err
}
req.Header.Set("X-Auth-Token", api.sc.Token())
req.Header.Set("X-OpenStack-Nova-API-Version", api.sc.Microversion)

resp, err := api.sc.HTTPClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
Comment on lines +196 to +200
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Resource leak: defer inside loop accumulates closers.

Using defer resp.Body.Close() inside a loop means all response bodies stay open until the function returns. For paginated responses with many pages, this can exhaust file descriptors.

🐛 Proposed fix: close body immediately after use
 		resp, err := api.sc.HTTPClient.Do(req)
 		if err != nil {
 			return nil, err
 		}
-		defer resp.Body.Close()
 
 		if resp.StatusCode != http.StatusOK {
+			resp.Body.Close()
 			return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
 		}
 
 		// ... decode logic ...
 
 		if err := json.NewDecoder(resp.Body).Decode(&list); err != nil {
+			resp.Body.Close()
 			return nil, err
 		}
+		resp.Body.Close()
 
 		// Convert to ServerDetail

Alternatively, extract the HTTP request logic into a helper function where defer would be scoped correctly.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/scheduling/nova/nova_client.go` around lines 196 - 200, The HTTP
response body is being deferred inside a loop (resp, req from
api.sc.HTTPClient.Do), causing many open bodies; change the code in the loop to
call resp.Body.Close() immediately after you're finished reading/parsing the
response (e.g., after io.ReadAll or json decoding) instead of deferring, or
refactor the request+read logic into a helper function (e.g., extract
DoRequest/parsePage helper) so that a defer resp.Body.Close() is scoped to that
helper and the body is closed before the next iteration; ensure resp is closed
on all error paths as well.


if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}

// Response structure with nested flavor
var list struct {
Servers []struct {
ID string `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
TenantID string `json:"tenant_id"`
Created string `json:"created"`
AvailabilityZone string `json:"OS-EXT-AZ:availability_zone"`
Hypervisor string `json:"OS-EXT-SRV-ATTR:hypervisor_hostname"`
Flavor struct {
OriginalName string `json:"original_name"`
RAM uint64 `json:"ram"`
VCPUs uint64 `json:"vcpus"`
Disk uint64 `json:"disk"`
} `json:"flavor"`
} `json:"servers"`
Links []struct {
Rel string `json:"rel"`
Href string `json:"href"`
} `json:"servers_links"`
}

if err := json.NewDecoder(resp.Body).Decode(&list); err != nil {
return nil, err
}

// Convert to ServerDetail
for _, s := range list.Servers {
result = append(result, ServerDetail{
ID: s.ID,
Name: s.Name,
Status: s.Status,
TenantID: s.TenantID,
Created: s.Created,
AvailabilityZone: s.AvailabilityZone,
Hypervisor: s.Hypervisor,
FlavorName: s.Flavor.OriginalName,
FlavorRAM: s.Flavor.RAM,
FlavorVCPUs: s.Flavor.VCPUs,
FlavorDisk: s.Flavor.Disk,
})
}

// Check for next page
nextURL = nil
for _, link := range list.Links {
if link.Rel == "next" {
nextURL = &link.Href
break
}
}
}

slog.Info("fetched servers for project", "projectID", projectID, "count", len(result))
return result, nil
}
33 changes: 25 additions & 8 deletions internal/scheduling/reservations/commitments/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,54 @@
package commitments

import (
"context"
"net/http"
"sync"

"github.com/cobaltcore-dev/cortex/internal/scheduling/nova"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// UsageNovaClient is a minimal interface for the Nova client needed by the usage API.
// This allows for easy mocking in tests without implementing the full NovaClient interface.
type UsageNovaClient interface {
ListProjectServers(ctx context.Context, projectID string) ([]nova.ServerDetail, error)
}

// HTTPAPI implements Limes LIQUID commitment validation endpoints.
type HTTPAPI struct {
client client.Client
config Config
monitor ChangeCommitmentsAPIMonitor
client client.Client
config Config
novaClient UsageNovaClient
monitor ChangeCommitmentsAPIMonitor
usageMonitor ReportUsageAPIMonitor
capacityMonitor ReportCapacityAPIMonitor
// Mutex to serialize change-commitments requests
changeMutex sync.Mutex
}

func NewAPI(client client.Client) *HTTPAPI {
return NewAPIWithConfig(client, DefaultConfig())
return NewAPIWithConfig(client, DefaultConfig(), nil)
}
Comment on lines 34 to 36
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don’t publish report-usage from the nil-dependency constructor.

Line 35 still passes nil into NewAPIWithConfig(), but Line 56 always registers HandleReportUsage(). Since HandleReportUsage() constructs a UsageCalculator with api.novaClient in internal/scheduling/reservations/commitments/api_report_usage.go Lines 61-63, NewAPI() now creates an API instance that exposes an endpoint it cannot actually serve.

Minimal guard
 func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
 	registry.MustRegister(&api.monitor)
 	registry.MustRegister(&api.usageMonitor)
 	registry.MustRegister(&api.capacityMonitor)
 	mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
 	mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
 	mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
-	mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage
+	if api.novaClient != nil {
+		mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage
+	}
 }

A stronger fix would be to make NewAPI() require a UsageNovaClient as well, so the API cannot be constructed in an unusable state.

Also applies to: 49-56

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/scheduling/reservations/commitments/api.go` around lines 34 - 36,
NewAPI currently calls NewAPIWithConfig(client, DefaultConfig(), nil) which lets
HTTPAPI register HandleReportUsage even when api.novaClient is nil; update
NewAPI/NewAPIWithConfig usage so the report-usage handler is only registered
when a non-nil UsageNovaClient is provided (check api.novaClient before
registering HandleReportUsage in the constructor) or change NewAPI to require
and accept a UsageNovaClient parameter so an API cannot be created without the
dependency; refer to NewAPI, NewAPIWithConfig, HandleReportUsage,
UsageCalculator and api.novaClient when making the guard or signature change.


func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI {
func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaClient) *HTTPAPI {
return &HTTPAPI{
client: client,
config: config,
monitor: NewChangeCommitmentsAPIMonitor(),
client: client,
config: config,
novaClient: novaClient,
monitor: NewChangeCommitmentsAPIMonitor(),
usageMonitor: NewReportUsageAPIMonitor(),
capacityMonitor: NewReportCapacityAPIMonitor(),
}
}

func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
registry.MustRegister(&api.monitor)
registry.MustRegister(&api.usageMonitor)
registry.MustRegister(&api.capacityMonitor)
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage
}
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ func newCommitmentTestEnv(
// Use custom config if provided, otherwise use default
var api *HTTPAPI
if customConfig != nil {
api = NewAPIWithConfig(wrappedClient, *customConfig)
api = NewAPIWithConfig(wrappedClient, *customConfig, nil)
} else {
api = NewAPI(wrappedClient)
}
Expand Down
2 changes: 1 addition & 1 deletion internal/scheduling/reservations/commitments/api_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l
// Build resources map
resources := make(map[liquid.ResourceName]liquid.ResourceInfo)
for groupName, groupData := range flavorGroups {
resourceName := liquid.ResourceName("ram_" + groupName)
resourceName := liquid.ResourceName(commitmentResourceNamePrefix + groupName)

flavorNames := make([]string, 0, len(groupData.Flavors))
for _, flavor := range groupData.Flavors {
Expand Down
Loading
Loading