diff --git a/README.md b/README.md index dc7a3577..5624ea6c 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,12 @@ Hypeman can be configured using the following environment variables: | `DNS_SERVER` | DNS server IP address for VMs | `1.1.1.1` | | `MAX_CONCURRENT_BUILDS` | Maximum number of concurrent image builds | `1` | | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | +| `ENV` | Deployment environment (filters telemetry, e.g. your name for dev) | `unset` | +| `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | +| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `127.0.0.1:4317` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID for telemetry (differentiates multiple servers) | hostname | +| `LOG_LEVEL` | Default log level (debug, info, warn, error) | `info` | +| `LOG_LEVEL_<SUBSYSTEM>` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | **Important: Subnet Configuration** @@ -159,6 +165,42 @@ make dev ``` The server will start on port 8080 (configurable via `PORT` environment variable). 
+#### Local OpenTelemetry (optional) + +To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir): + +```bash +# Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin) +# Note, if you are developing on a shared server, you can use the same LGTM stack as your peer(s) +# You will be able to sort your metrics, traces, and logs using the ENV configuration (see below) +docker run -d --name lgtm \ + -p 127.0.0.1:3000:3000 \ + -p 127.0.0.1:4317:4317 \ + -p 127.0.0.1:4318:4318 \ + -p 127.0.0.1:9090:9090 \ + -p 127.0.0.1:4040:4040 \ + grafana/otel-lgtm:latest + +# If developing on a remote server, forward the port to your local machine: +# ssh -L 3001:localhost:3000 your-server (then open http://localhost:3001) + +# Enable OTel in .env (set ENV to your name to filter your telemetry) +echo "OTEL_ENABLED=true" >> .env +echo "ENV=yourname" >> .env + +# Restart dev server +make dev +``` + +Open http://localhost:3000 to view traces (Tempo), metrics (Mimir), and logs (Loki) in Grafana. + +**Import the Hypeman dashboard:** +1. Go to Dashboards → New → Import +2. Upload `dashboards/hypeman.json` or paste its contents +3. Select the Prometheus datasource and click Import + +Use the Environment/Instance dropdowns to filter by `deployment.environment` or `service.instance.id`. + ### Testing Network tests require elevated permissions to create bridges and TAP devices. 
diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index ab536869..9f010002 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -26,18 +26,18 @@ func newTestService(t *testing.T) *ApiService { } p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) if err != nil { t.Fatalf("failed to create image manager: %v", err) } systemMgr := system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) // 0 = unlimited storage + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil) // Register cleanup for orphaned Cloud Hypervisor processes t.Cleanup(func() { diff --git a/cmd/api/api/images.go b/cmd/api/api/images.go index 913b9918..68f4cc96 100644 --- a/cmd/api/api/images.go +++ b/cmd/api/api/images.go @@ -14,7 +14,7 @@ func (s *ApiService) ListImages(ctx context.Context, request oapi.ListImagesRequ domainImages, err := s.ImageManager.ListImages(ctx) if err != nil { - log.Error("failed to list images", "error", err) + log.ErrorContext(ctx, "failed to list images", "error", err) return oapi.ListImages500JSONResponse{ Code: "internal_error", Message: "failed to list images", @@ -50,7 +50,7 @@ func (s *ApiService) CreateImage(ctx context.Context, request oapi.CreateImageRe Message: "image not found", }, nil default: - log.Error("failed to create image", "error", err, "name", request.Body.Name) + log.ErrorContext(ctx, "failed to create image", "error", err, "name", request.Body.Name) return oapi.CreateImage500JSONResponse{ Code: "internal_error", Message: "failed to create image", @@ -72,7 +72,7 @@ func 
(s *ApiService) GetImage(ctx context.Context, request oapi.GetImageRequestO Message: "image not found", }, nil default: - log.Error("failed to get image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to get image", "error", err, "name", request.Name) return oapi.GetImage500JSONResponse{ Code: "internal_error", Message: "failed to get image", @@ -94,7 +94,7 @@ func (s *ApiService) DeleteImage(ctx context.Context, request oapi.DeleteImageRe Message: "image not found", }, nil default: - log.Error("failed to delete image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to delete image", "error", err, "name", request.Name) return oapi.DeleteImage500JSONResponse{ Code: "internal_error", Message: "failed to delete image", diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go index 2d84ac08..12f5bd7b 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -21,18 +21,18 @@ func (s *ApiService) ListInstances(ctx context.Context, request oapi.ListInstanc domainInsts, err := s.InstanceManager.ListInstances(ctx) if err != nil { - log.Error("failed to list instances", "error", err) + log.ErrorContext(ctx, "failed to list instances", "error", err) return oapi.ListInstances500JSONResponse{ Code: "internal_error", Message: "failed to list instances", }, nil } - + oapiInsts := make([]oapi.Instance, len(domainInsts)) for i, inst := range domainInsts { oapiInsts[i] = instanceToOAPI(inst) } - + return oapi.ListInstances200JSONResponse(oapiInsts), nil } @@ -160,7 +160,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Message: err.Error(), }, nil default: - log.Error("failed to create instance", "error", err, "image", request.Body.Image) + log.ErrorContext(ctx, "failed to create instance", "error", err, "image", request.Body.Image) return oapi.CreateInstance500JSONResponse{ Code: "internal_error", Message: "failed to create instance", @@ -195,7 +195,7 @@ func (s *ApiService) 
GetInstance(ctx context.Context, request oapi.GetInstanceRe Message: "multiple instances have this name, use instance ID instead", }, nil default: - log.Error("failed to get instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get instance", "error", err, "id", request.Id) return oapi.GetInstance500JSONResponse{ Code: "internal_error", Message: "failed to get instance", @@ -235,7 +235,7 @@ func (s *ApiService) DeleteInstance(ctx context.Context, request oapi.DeleteInst Message: "instance not found", }, nil default: - log.Error("failed to delete instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to delete instance", "error", err, "id", request.Id) return oapi.DeleteInstance500JSONResponse{ Code: "internal_error", Message: "failed to delete instance", @@ -280,7 +280,7 @@ func (s *ApiService) StandbyInstance(ctx context.Context, request oapi.StandbyIn Message: err.Error(), }, nil default: - log.Error("failed to standby instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to standby instance", "error", err, "id", request.Id) return oapi.StandbyInstance500JSONResponse{ Code: "internal_error", Message: "failed to standby instance", @@ -325,7 +325,7 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn Message: err.Error(), }, nil default: - log.Error("failed to restore instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to restore instance", "error", err, "id", request.Id) return oapi.RestoreInstance500JSONResponse{ Code: "internal_error", Message: "failed to restore instance", @@ -417,18 +417,18 @@ func (s *ApiService) GetInstanceLogs(ctx context.Context, request oapi.GetInstan // AttachVolume attaches a volume to an instance (not yet implemented) func (s *ApiService) AttachVolume(ctx context.Context, request oapi.AttachVolumeRequestObject) (oapi.AttachVolumeResponseObject, error) { - return oapi.AttachVolume500JSONResponse{ + return 
oapi.AttachVolume500JSONResponse{ Code: "not_implemented", Message: "volume attachment not yet implemented", - }, nil + }, nil } // DetachVolume detaches a volume from an instance (not yet implemented) func (s *ApiService) DetachVolume(ctx context.Context, request oapi.DetachVolumeRequestObject) (oapi.DetachVolumeResponseObject, error) { - return oapi.DetachVolume500JSONResponse{ + return oapi.DetachVolume500JSONResponse{ Code: "not_implemented", Message: "volume detachment not yet implemented", - }, nil + }, nil } // instanceToOAPI converts domain Instance to OAPI Instance diff --git a/cmd/api/api/volumes.go b/cmd/api/api/volumes.go index 8b4afd88..6e903801 100644 --- a/cmd/api/api/volumes.go +++ b/cmd/api/api/volumes.go @@ -18,18 +18,18 @@ func (s *ApiService) ListVolumes(ctx context.Context, request oapi.ListVolumesRe domainVols, err := s.VolumeManager.ListVolumes(ctx) if err != nil { - log.Error("failed to list volumes", "error", err) + log.ErrorContext(ctx, "failed to list volumes", "error", err) return oapi.ListVolumes500JSONResponse{ Code: "internal_error", Message: "failed to list volumes", }, nil } - + oapiVols := make([]oapi.Volume, len(domainVols)) for i, vol := range domainVols { oapiVols[i] = volumeToOAPI(vol) } - + return oapi.ListVolumes200JSONResponse(oapiVols), nil } @@ -56,7 +56,7 @@ func (s *ApiService) CreateVolume(ctx context.Context, request oapi.CreateVolume Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume", "error", err, "name", request.JSONBody.Name) + log.ErrorContext(ctx, "failed to create volume", "error", err, "name", request.JSONBody.Name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -171,7 +171,7 @@ func (s *ApiService) createVolumeFromMultipart(ctx context.Context, multipartRea Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume from archive", "error", err, "name", name) + 
log.ErrorContext(ctx, "failed to create volume from archive", "error", err, "name", name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -222,7 +222,7 @@ func (s *ApiService) GetVolume(ctx context.Context, request oapi.GetVolumeReques Message: "multiple volumes have this name, use volume ID instead", }, nil default: - log.Error("failed to get volume", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get volume", "error", err, "id", request.Id) return oapi.GetVolume500JSONResponse{ Code: "internal_error", Message: "failed to get volume", @@ -269,7 +269,7 @@ func (s *ApiService) DeleteVolume(ctx context.Context, request oapi.DeleteVolume Message: "volume is in use by an instance", }, nil default: - log.Error("failed to delete volume", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to delete volume", "error", err, "id", request.Id) return oapi.DeleteVolume500JSONResponse{ Code: "internal_error", Message: "failed to delete volume", @@ -302,5 +302,3 @@ func volumeToOAPI(vol volumes.Volume) oapi.Volume { return oapiVol } - - diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 0b59f6f0..18db2007 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -2,11 +2,52 @@ package config import ( "os" + "runtime/debug" "strconv" "github.com/joho/godotenv" ) +func getHostname() string { + if h, err := os.Hostname(); err == nil { + return h + } + return "unknown" +} + +// getBuildVersion extracts version info from Go's embedded build info. +// Returns git short hash + "-dirty" suffix if uncommitted changes, or "unknown" if unavailable. 
+func getBuildVersion() string { + info, ok := debug.ReadBuildInfo() + if !ok { + return "unknown" + } + + var revision string + var dirty bool + for _, s := range info.Settings { + switch s.Key { + case "vcs.revision": + revision = s.Value + case "vcs.modified": + dirty = s.Value == "true" + } + } + + if revision == "" { + return "unknown" + } + + // Use short hash (8 chars) + if len(revision) > 8 { + revision = revision[:8] + } + if dirty { + revision += "-dirty" + } + return revision +} + type Config struct { Port string DataDir string @@ -23,13 +64,25 @@ type Config struct { LogRotateInterval string // Resource limits - per instance - MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) - MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) + MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) + MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) // Resource limits - aggregate MaxTotalVcpus int // Aggregate vCPU limit across all instances (0 = unlimited) MaxTotalMemory string // Aggregate memory limit across all instances (0 = unlimited) MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) + + // OpenTelemetry configuration + OtelEnabled bool // Enable OpenTelemetry + OtelEndpoint string // OTLP endpoint (gRPC) + OtelServiceName string // Service name for tracing + OtelServiceInstanceID string // Service instance ID (default: hostname) + OtelInsecure bool // Disable TLS for OTLP + Version string // Application version for telemetry + Env string // Deployment environment (e.g., dev, staging, prod) + + // Logging configuration + LogLevel string // Default log level (debug, info, warn, error) } // Load loads configuration from environment variables @@ -43,7 +96,7 @@ func Load() *Config { DataDir: getEnv("DATA_DIR", "/var/lib/hypeman"), BridgeName: getEnv("BRIDGE_NAME", "vmbr0"), SubnetCIDR: getEnv("SUBNET_CIDR", "10.100.0.0/16"), - SubnetGateway: 
getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet + SubnetGateway: getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet UplinkInterface: getEnv("UPLINK_INTERFACE", ""), // empty = auto-detect from default route JwtSecret: getEnv("JWT_SECRET", ""), DNSServer: getEnv("DNS_SERVER", "1.1.1.1"), @@ -61,6 +114,18 @@ func Load() *Config { MaxTotalVcpus: getEnvInt("MAX_TOTAL_VCPUS", 0), MaxTotalMemory: getEnv("MAX_TOTAL_MEMORY", ""), MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), + + // OpenTelemetry configuration + OtelEnabled: getEnvBool("OTEL_ENABLED", false), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "127.0.0.1:4317"), + OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), + OtelServiceInstanceID: getEnv("OTEL_SERVICE_INSTANCE_ID", getHostname()), + OtelInsecure: getEnvBool("OTEL_INSECURE", true), + Version: getEnv("VERSION", getBuildVersion()), + Env: getEnv("ENV", "unset"), + + // Logging configuration + LogLevel: getEnv("LOG_LEVEL", "info"), } return cfg @@ -82,3 +147,11 @@ func getEnvInt(key string, defaultValue int) int { return defaultValue } +func getEnvBool(key string, defaultValue bool) bool { + if value := os.Getenv(key); value != "" { + if boolVal, err := strconv.ParseBool(value); err == nil { + return boolVal + } + } + return defaultValue +} diff --git a/cmd/api/main.go b/cmd/api/main.go index d2ad80f2..3951d7b7 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "os/signal" + "strings" "syscall" "time" @@ -20,9 +21,14 @@ import ( nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware" "github.com/onkernel/hypeman" "github.com/onkernel/hypeman/cmd/api/api" + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/exec" "github.com/onkernel/hypeman/lib/instances" mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" + "github.com/onkernel/hypeman/lib/otel" + "github.com/onkernel/hypeman/lib/vmm" 
+ "github.com/riandyrn/otelchi" "golang.org/x/sync/errgroup" ) @@ -34,6 +40,52 @@ func main() { } func run() error { + // Load config early for OTel initialization + cfg := config.Load() + + // Initialize OpenTelemetry (before wire initialization) + otelCfg := otel.Config{ + Enabled: cfg.OtelEnabled, + Endpoint: cfg.OtelEndpoint, + ServiceName: cfg.OtelServiceName, + ServiceInstanceID: cfg.OtelServiceInstanceID, + Insecure: cfg.OtelInsecure, + Version: cfg.Version, + Env: cfg.Env, + } + + otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) + if err != nil { + // Log warning but don't fail - graceful degradation + slog.Warn("failed to initialize OpenTelemetry, continuing without telemetry", "error", err) + } + if otelShutdown != nil { + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := otelShutdown(shutdownCtx); err != nil { + slog.Warn("error shutting down OpenTelemetry", "error", err) + } + }() + } + + // Initialize exec and vmm metrics if OTel is enabled + if otelProvider != nil && otelProvider.Meter != nil { + execMetrics, err := exec.NewMetrics(otelProvider.Meter) + if err == nil { + exec.SetMetrics(execMetrics) + } + vmmMetrics, err := vmm.NewMetrics(otelProvider.Meter) + if err == nil { + vmm.SetMetrics(vmmMetrics) + } + } + + // Set global OTel log handler for logger package + if otelProvider != nil && otelProvider.LogHandler != nil { + otel.SetGlobalLogHandler(otelProvider.LogHandler) + } + // Initialize app with wire app, cleanup, err := initializeApp() if err != nil { @@ -46,6 +98,11 @@ func run() error { logger := app.Logger + // Log OTel status + if cfg.OtelEnabled { + logger.Info("OpenTelemetry enabled", "endpoint", cfg.OtelEndpoint, "service", cfg.OtelServiceName) + } + // Validate JWT secret is configured if app.Config.JwtSecret == "" { logger.Warn("JWT_SECRET not configured - API authentication will fail") @@ -84,6 +141,23 @@ func run() error { // 
Create router r := chi.NewRouter() + // Prepare HTTP metrics middleware (applied inside API group, not globally) + // Global application breaks WebSocket (Hijacker) and SSE (Flusher) + var httpMetricsMw func(http.Handler) http.Handler + if otelProvider != nil && otelProvider.Meter != nil { + httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) + if err == nil { + httpMetricsMw = httpMetrics.Middleware + } + } + + // Create access logger with OTel handler for HTTP request logging with trace correlation + var accessLogHandler slog.Handler + if otelProvider != nil { + accessLogHandler = otelProvider.LogHandler + } + accessLogger := mw.NewAccessLogger(accessLogHandler) + // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() if err != nil { @@ -95,11 +169,12 @@ func run() error { spec.Servers = nil // Custom exec endpoint (outside OpenAPI spec, uses WebSocket) + // Note: No otelchi here as WebSocket doesn't work well with tracing middleware r.With( middleware.RequestID, middleware.RealIP, - middleware.Logger, middleware.Recoverer, + mw.AccessLogger(accessLogger), mw.JwtAuth(app.Config.JwtSecret), ).Get("/instances/{id}/exec", app.ApiService.ExecHandler) @@ -108,8 +183,31 @@ func run() error { // Common middleware r.Use(middleware.RequestID) r.Use(middleware.RealIP) - r.Use(middleware.Logger) r.Use(middleware.Recoverer) + + // OpenTelemetry tracing middleware FIRST (creates span context) + if cfg.OtelEnabled { + r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) + } + + // Inject logger into request context for handlers to use + r.Use(mw.InjectLogger(accessLogger)) + + // Access logger AFTER otelchi so trace context is available + r.Use(mw.AccessLogger(accessLogger)) + if httpMetricsMw != nil { + // Skip HTTP metrics for SSE streaming endpoints (logs) + r.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/logs") { + 
next.ServeHTTP(w, r) + return + } + httpMetricsMw(next).ServeHTTP(w, r) + }) + }) + } + r.Use(middleware.Timeout(60 * time.Second)) // OpenAPI request validation with authentication @@ -225,4 +323,3 @@ func getRunningInstanceIDs(app *application) []string { } return running } - diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json new file mode 100644 index 00000000..c2c46c55 --- /dev/null +++ b/dashboards/hypeman.json @@ -0,0 +1,470 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "gridPos": { "h": 3, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_uptime_seconds{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 3, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(hypeman_instances_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "title": "Instances", + "type": "stat" + }, + { + "gridPos": { "h": 3, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": 
"auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "refId": "A" + } + ], + "title": "Volumes", + "type": "stat" + }, + { + "gridPos": { "h": 3, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(hypeman_images_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Images", + "type": "stat" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_http_request_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m]) / rate(hypeman_http_request_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "{{http_request_method}} {{http_route}}", + "refId": "A" + } + ], + "title": "HTTP Request Latency (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": 
{ "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_http_requests_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}", + "refId": "A" + } + ], + "title": "HTTP Requests/sec", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "reqps" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 11 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg create", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_restore_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_restore_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg restore", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_standby_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_standby_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg standby", + "refId": "C" + } + ], + "title": "Instance Operation Latency (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, 
"y": 11 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_allocated_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "allocated", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "used", + "refId": "B" + } + ], + "title": "Volume Storage", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 11 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "sent (stdin)", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "received (stdout+stderr)", + "refId": "B" + } + ], + "title": "Exec Bytes (cumulative)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 19 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": 
"hypeman_images_build_queue_length{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "queue length", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "pulls ({{status}})", + "refId": "B" + } + ], + "title": "Image Build Queue & Pulls", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 19 }, + "id": 11, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_vmm_api_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_vmm_api_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg {{operation}}", + "refId": "A" + } + ], + "title": "VMM API Latency (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 19 }, + "id": 12, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_network_allocations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "IP allocations", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "TAP ops ({{operation}})", + "refId": "B" + } + ], + "title": "Network", + "type": 
"timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }, + "id": 15, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_build_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_images_build_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg build", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_volumes_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg volume create", + "refId": "B" + } + ], + "title": "Image Build & Volume Create (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }, + "id": 16, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "{{from}} → {{to}}", + "refId": "A" + } + ], + "title": "Instance State Transitions", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }, + "id": 13, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": 
"prometheus" }, + "expr": "go_memory_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "memory used", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_memory_gc_goal_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "GC goal", + "refId": "B" + } + ], + "title": "Go Memory", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 }, + "id": 14, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_goroutine_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "goroutines", + "refId": "A" + } + ], + "title": "Goroutines", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, + "id": 17, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "sessions ({{status}}, exit={{exit_code}})", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_exec_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg duration (s)", + "refId": "B" + } + ], + "title": "Exec Sessions & Duration", + "type": "timeseries" + } + ], + "refresh": "10s", + 
"schemaVersion": 38, + "tags": ["hypeman"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, deployment_environment_name)", + "hide": 0, + "includeAll": true, + "label": "Environment", + "multi": true, + "name": "env", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "refId": "A" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, service_instance_id)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, service_instance_id)", "refId": "A" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Hypeman", + "uid": "hypeman-overview", + "version": 2, + "weekStart": "" +} diff --git a/go.mod b/go.mod index 0f4f6cb0..210a679a 100644 --- a/go.mod +++ b/go.mod @@ -22,10 +22,19 @@ require ( github.com/opencontainers/image-spec v1.1.1 github.com/opencontainers/runtime-spec v1.2.1 github.com/opencontainers/umoci v0.6.0 + github.com/riandyrn/otelchi v0.12.2 github.com/samber/lo v1.52.0 github.com/stretchr/testify v1.11.1 github.com/u-root/u-root v0.15.0 github.com/vishvananda/netlink v1.3.1 + go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + 
go.opentelemetry.io/otel/metric v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 golang.org/x/sync v0.17.0 golang.org/x/sys v0.38.0 golang.org/x/term v0.37.0 @@ -39,17 +48,22 @@ require ( github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/apex/log v1.9.0 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/docker/cli v28.2.2+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-test/deep v1.1.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/pgzip v1.2.6 // indirect @@ -66,7 +80,6 @@ require ( github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/sirupsen/logrus v1.9.4-0.20230606125235-dd1b4c2e81af // indirect github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 // indirect @@ -74,9 +87,17 @@ require ( github.com/vbatts/tar-split v0.12.1 // indirect 
github.com/vishvananda/netns v0.0.5 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/log v0.14.0 // indirect + go.opentelemetry.io/otel/sdk/log v0.14.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index ac155882..e1226b84 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 h1:6lhrsTEnloDPXyeZBvSYvQf8u86jbKehZPVDDlkgDl4= github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8= github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= @@ -36,6 +38,8 @@ github.com/docker/docker-credential-helpers v0.9.3/go.mod 
h1:x+4Gbw9aGmChi3qTLZj github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/getkin/kin-openapi v0.133.0 h1:pJdmNohVIJ97r4AUFtEXRXwESr8b0bD721u/Tz6k8PQ= github.com/getkin/kin-openapi v0.133.0/go.mod h1:boAciF6cXk5FhPqe/NQeBTeenbjqU4LhWBf09ILVvWE= @@ -44,6 +48,7 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME github.com/go-chi/chi/v5 v5.2.3 h1:WQIt9uxdsAbgIYgid+BpYc+liqQZGMHRaUwp0JUcvdE= github.com/go-chi/chi/v5 v5.2.3/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -75,6 +80,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= 
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= @@ -144,9 +151,11 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/riandyrn/otelchi v0.12.2 h1:6QhGv0LVw/dwjtPd12mnNrl0oEQF4ZAlmHcnlTYbeAg= +github.com/riandyrn/otelchi v0.12.2/go.mod h1:weZZeUJURvtCcbWsdb7Y6F8KFZGedJlSrgUjq9VirV8= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 h1:58jvc5cZ+hGKidQ4Z37/+rj9eQxRRjOOsqNEwPSZXR4= github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67/go.mod h1:LLjEAc6zmycfeN7/1fxIphWQPjHpTt7ElqT7eVf8e4A= github.com/samber/lo v1.52.0 h1:Rvi+3BFHES3A8meP33VPAxiBZX/Aws5RxrschYGjomw= @@ -189,16 +198,36 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= 
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 h1:bwnLpizECbPr1RrQ27waeY2SPIPeccCx/xLuoYADZ9s= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0/go.mod h1:3nWlOiiqA9UtUnrcNk82mYasNxD8ehOspL0gOfEo6Y4= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 h1:OMqPldHt79PqWKOMYIAQs3CxAi7RLgPxwfFSwr4ZxtM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0/go.mod h1:1biG4qiqTxKiUCtoWDPpL3fB3KxVwCiGw81j3nKMuHE= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/log v0.14.0 h1:2rzJ+pOAZ8qmZ3DDHg73NEKzSZkhkGIua9gXtxNGgrM= +go.opentelemetry.io/otel/log v0.14.0/go.mod h1:5jRG92fEAgx0SU/vFPxmJvhIuDU9E1SUnEQrMlJpOno= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod 
h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/log v0.14.0 h1:JU/U3O7N6fsAXj0+CXz21Czg532dW2V4gG1HE/e8Zrg= +go.opentelemetry.io/otel/sdk/log v0.14.0/go.mod h1:imQvII+0ZylXfKU7/wtOND8Hn4OpT3YUoIgqJVksUkM= go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= @@ -230,6 +259,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod 
h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 h1:M1rk8KBnUsBDg1oPGHNCxG4vc1f49epmTO7xscSajMk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= diff --git a/lib/exec/client.go b/lib/exec/client.go index 0484bc96..a32e6338 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -9,6 +9,7 @@ import ( "net" "strings" "sync" + "sync/atomic" "time" "google.golang.org/grpc" @@ -111,6 +112,9 @@ func (c *bufferedConn) Read(p []byte) (int, error) { // ExecIntoInstance executes command in instance via vsock using gRPC // vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { + start := time.Now() + var bytesSent int64 + // Get or create a reusable gRPC connection for this vsock socket // Connection pooling avoids issues with rapid connect/disconnect cycles grpcConn, err := getOrCreateConn(ctx, vsockSocketPath) @@ -146,13 +150,14 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti // Handle stdin in background if opts.Stdin != nil { go func() { - buf := make([]byte, 32 * 1024) + buf := make([]byte, 32*1024) for { n, err := opts.Stdin.Read(buf) if n > 0 { stream.Send(&ExecRequest{ Request: &ExecRequest_Stdin{Stdin: buf[:n]}, }) + atomic.AddInt64(&bytesSent, int64(n)) } if err != nil { stream.CloseSend() @@ -185,14 +190,20 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti opts.Stderr.Write(r.Stderr) } case *ExecResponse_ExitCode: - return &ExitStatus{Code: int(r.ExitCode)}, nil + exitCode := int(r.ExitCode) + // Record metrics + if ExecMetrics != nil { + bytesReceived := int64(totalStdout + totalStderr) + 
ExecMetrics.RecordSession(ctx, start, exitCode, atomic.LoadInt64(&bytesSent), bytesReceived) + } + return &ExitStatus{Code: exitCode}, nil } } } // dialVsock connects to Cloud Hypervisor's vsock Unix socket and performs the handshake func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { - slog.Debug("connecting to vsock", "socket", vsockSocketPath) + slog.DebugContext(ctx, "connecting to vsock", "socket", vsockSocketPath) // Use dial timeout, respecting context deadline if shorter dialTimeout := vsockDialTimeout @@ -209,7 +220,7 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("dial vsock socket %s: %w", vsockSocketPath, err) } - slog.Debug("connected to vsock socket, performing handshake", "port", vsockGuestPort) + slog.DebugContext(ctx, "connected to vsock socket, performing handshake", "port", vsockGuestPort) // Set deadline for handshake if err := conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { @@ -244,10 +255,9 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("vsock handshake failed: %s", response) } - slog.Debug("vsock handshake successful", "response", response) + slog.DebugContext(ctx, "vsock handshake successful", "response", response) // Return wrapped connection that uses the bufio.Reader // This ensures any bytes buffered during handshake are not lost return &bufferedConn{Conn: conn, reader: reader}, nil } - diff --git a/lib/exec/metrics.go b/lib/exec/metrics.go new file mode 100644 index 00000000..ef1c83c7 --- /dev/null +++ b/lib/exec/metrics.go @@ -0,0 +1,105 @@ +package exec + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for exec operations. 
+type Metrics struct { + sessionsTotal metric.Int64Counter + duration metric.Float64Histogram + bytesSentTotal metric.Int64Counter + bytesReceivedTotal metric.Int64Counter +} + +// ExecMetrics is the global metrics instance for the exec package. +// Set this via SetMetrics() during application initialization. +var ExecMetrics *Metrics + +// SetMetrics sets the global metrics instance. +func SetMetrics(m *Metrics) { + ExecMetrics = m +} + +// NewMetrics creates exec metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + sessionsTotal, err := meter.Int64Counter( + "hypeman_exec_sessions_total", + metric.WithDescription("Total number of exec sessions"), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "hypeman_exec_duration_seconds", + metric.WithDescription("Exec command duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + bytesSentTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_sent_total", + metric.WithDescription("Total bytes sent to guest (stdin)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + bytesReceivedTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_received_total", + metric.WithDescription("Total bytes received from guest (stdout+stderr)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + sessionsTotal: sessionsTotal, + duration: duration, + bytesSentTotal: bytesSentTotal, + bytesReceivedTotal: bytesReceivedTotal, + }, nil +} + +// RecordSession records metrics for a completed exec session. 
+func (m *Metrics) RecordSession(ctx context.Context, start time.Time, exitCode int, bytesSent, bytesReceived int64) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if exitCode != 0 { + status = "error" + } + + m.sessionsTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("status", status), + attribute.Int("exit_code", exitCode), + )) + + m.duration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) + + if bytesSent > 0 { + m.bytesSentTotal.Add(ctx, bytesSent) + } + if bytesReceived > 0 { + m.bytesReceivedTotal.Add(ctx, bytesReceived) + } +} diff --git a/lib/images/manager.go b/lib/images/manager.go index 8d6a403b..b637bce5 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -10,6 +10,7 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/metric" ) const ( @@ -33,10 +34,12 @@ type manager struct { ociClient *ociClient queue *BuildQueue createMu sync.Mutex + metrics *Metrics } -// NewManager creates a new image manager -func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { +// NewManager creates a new image manager. +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Manager, error) { // Create cache directory under dataDir for OCI layouts cacheDir := p.SystemOCICache() ociClient, err := newOCIClient(cacheDir) @@ -49,6 +52,16 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { ociClient: ociClient, queue: NewBuildQueue(maxConcurrentBuilds), } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newMetrics(meter, m) + if err != nil { + return nil, fmt.Errorf("create metrics: %w", err) + } + m.metrics = metrics + } + m.RecoverInterruptedBuilds() return m, nil } @@ -78,7 +91,7 @@ func (m *manager) CreateImage(ctx context.Context, req CreateImageRequest) (*Ima // Add a 2-second timeout to ensure fast failure on rate limits or errors resolveCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() - + ref, err := normalized.Resolve(resolveCtx, m.ociClient) if err != nil { return nil, fmt.Errorf("resolve manifest: %w", err) @@ -134,11 +147,13 @@ func (m *manager) createAndQueueImage(ref *ResolvedRef) (*Image, error) { } func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { + buildStart := time.Now() buildDir := m.paths.SystemBuild(ref.String()) tempDir := filepath.Join(buildDir, "rootfs") if err := os.MkdirAll(buildDir, 0755); err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("create build dir: %w", err)) + m.recordBuildMetrics(ctx, buildStart, "failed") return } @@ -153,8 +168,11 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { result, err := m.ociClient.pullAndExport(ctx, ref.String(), ref.Digest(), tempDir) if err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("pull and export: %w", err)) + m.recordPullMetrics(ctx, "failed") + m.recordBuildMetrics(ctx, buildStart, "failed") return } + m.recordPullMetrics(ctx, "success") // Check if this digest already exists and is ready (deduplication) if meta, err := 
readMetadata(m.paths, ref.Repository(), ref.DigestHex()); err == nil { @@ -209,6 +227,8 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { fmt.Fprintf(os.Stderr, "Warning: failed to create tag symlink: %v\n", err) } } + + m.recordBuildMetrics(ctx, buildStart, "success") } func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err error) { diff --git a/lib/images/manager_test.go b/lib/images/manager_test.go index 984a7ba3..406566ef 100644 --- a/lib/images/manager_test.go +++ b/lib/images/manager_test.go @@ -14,7 +14,7 @@ import ( func TestCreateImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -41,7 +41,7 @@ func TestCreateImage(t *testing.T) { ref, err := ParseNormalizedRef(img.Name) require.NoError(t, err) digestHex := strings.SplitN(img.Digest, ":", 2)[1] - + // Check erofs disk file diskPath := digestPath(paths.New(dataDir), ref.Repository(), digestHex) diskStat, err := os.Stat(diskPath) @@ -56,7 +56,7 @@ func TestCreateImage(t *testing.T) { metaStat, err := os.Stat(metadataPath) require.NoError(t, err) require.False(t, metaStat.IsDir(), "metadata should be a file") - + // Read and verify metadata content meta, err := readMetadata(paths.New(dataDir), ref.Repository(), digestHex) require.NoError(t, err) @@ -66,7 +66,7 @@ func TestCreateImage(t *testing.T) { require.Nil(t, meta.Error) require.Equal(t, diskStat.Size(), meta.SizeBytes) require.NotEmpty(t, meta.Env, "should have environment variables") - t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", + t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", meta.Name, meta.Digest, meta.Status, len(meta.Env)) // Check that tag symlink exists and points to correct digest @@ -74,7 +74,7 @@ func TestCreateImage(t *testing.T) { linkStat, err := os.Lstat(linkPath) require.NoError(t, err) require.NotEqual(t, 0, 
linkStat.Mode()&os.ModeSymlink, "should be a symlink") - + // Verify symlink points to digest directory linkTarget, err := os.Readlink(linkPath) require.NoError(t, err) @@ -84,7 +84,7 @@ func TestCreateImage(t *testing.T) { func TestCreateImageDifferentTag(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -98,7 +98,7 @@ func TestCreateImageDifferentTag(t *testing.T) { require.Equal(t, "docker.io/library/alpine:3.18", img.Name) waitForReady(t, mgr, ctx, img.Name) - + img, err = mgr.GetImage(ctx, img.Name) require.NoError(t, err) require.NotEmpty(t, img.Digest) @@ -106,7 +106,7 @@ func TestCreateImageDifferentTag(t *testing.T) { func TestCreateImageDuplicate(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -135,7 +135,7 @@ func TestCreateImageDuplicate(t *testing.T) { func TestListImages(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -164,7 +164,7 @@ func TestListImages(t *testing.T) { func TestGetImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -188,7 +188,7 @@ func TestGetImage(t *testing.T) { func TestGetImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -199,7 +199,7 @@ func TestGetImageNotFound(t *testing.T) { func TestDeleteImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err 
:= NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -234,7 +234,7 @@ func TestDeleteImage(t *testing.T) { func TestDeleteImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -271,7 +271,7 @@ func TestNormalizedRefParsing(t *testing.T) { func TestLayerCaching(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -326,7 +326,7 @@ func TestLayerCaching(t *testing.T) { digestHex := strings.TrimPrefix(alpine1.Digest, "sha256:") disk1 := digestPath(paths.New(dataDir), alpine1Parsed.Repository(), digestHex) disk2 := digestPath(paths.New(dataDir), alpine2Parsed.Repository(), digestHex) - + require.Equal(t, disk1, disk2, "both references should point to same disk") stat, err := os.Stat(disk1) diff --git a/lib/images/metrics.go b/lib/images/metrics.go new file mode 100644 index 00000000..08334ef2 --- /dev/null +++ b/lib/images/metrics.go @@ -0,0 +1,103 @@ +package images + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for image operations. +type Metrics struct { + buildDuration metric.Float64Histogram + pullsTotal metric.Int64Counter +} + +// newMetrics creates and registers all image metrics. 
+func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges for queue length and total images + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + // Report queue length + o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) + + // Count images by status + metas, err := listAllTags(m.paths) + if err != nil { + return nil + } + statusCounts := make(map[string]int64) + for _, meta := range metas { + statusCounts[meta.Status]++ + } + for status, count := range statusCounts { + o.ObserveInt64(imagesTotal, count, + metric.WithAttributes(attribute.String("status", status))) + } + return nil + }, + buildQueueLength, + imagesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + buildDuration: buildDuration, + pullsTotal: pullsTotal, + }, nil +} + +// recordBuildMetrics records the build duration metric. 
+func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.buildDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordPullMetrics records the pull counter metric. +func (m *manager) recordPullMetrics(ctx context.Context, status string) { + if m.metrics == nil { + return + } + m.metrics.pullsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("status", status))) +} diff --git a/lib/images/queue.go b/lib/images/queue.go index 115d267c..5d08100a 100644 --- a/lib/images/queue.go +++ b/lib/images/queue.go @@ -113,3 +113,9 @@ func (q *BuildQueue) PendingCount() int { return len(q.pending) } +// QueueLength returns the total number of builds (active + pending) +func (q *BuildQueue) QueueLength() int { + q.mu.Lock() + defer q.mu.Unlock() + return len(q.active) + len(q.pending) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index faa32279..b82e421d 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -15,6 +15,7 @@ import ( "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/vmm" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" ) @@ -93,9 +94,17 @@ func (m *manager) createInstance( ctx context.Context, req CreateInstanceRequest, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "creating instance", "name", req.Name, "image", req.Image, "vcpus", req.Vcpus) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "CreateInstance") + defer span.End() + } + // 1. 
Validate request if err := validateCreateRequest(req); err != nil { log.ErrorContext(ctx, "invalid create request", "error", err) @@ -336,6 +345,12 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.createDuration, start, "success") + m.recordStateTransition(ctx, "stopped", string(StateRunning)) + } + // Return instance with derived state finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance created successfully", "id", id, "name", req.Name, "state", finalInst.State) @@ -452,14 +467,14 @@ func (m *manager) startAndBootVM( netConfig *network.NetworkConfig, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -539,7 +554,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf BootVcpus: inst.Vcpus, MaxVcpus: inst.Vcpus, } - + // Calculate and set guest topology based on host topology if topology := calculateGuestTopology(inst.Vcpus, m.hostTopology); topology != nil { cpus.Topology = topology @@ -551,7 +566,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf } if inst.HotplugSize > 0 { memory.HotplugSize = &inst.HotplugSize - memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case + memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case } // Disk configuration @@ -644,4 +659,3 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf func ptr[T any](v T) *T { return &v } - diff --git a/lib/instances/exec_test.go 
b/lib/instances/exec_test.go index ef0ba8f0..cee4e71b 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -46,7 +46,7 @@ func TestExecConcurrent(t *testing.T) { p := paths.New(tmpDir) // Setup image - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling nginx:alpine image...") @@ -215,4 +215,3 @@ func TestExecConcurrent(t *testing.T) { t.Logf("Long-running streams completed in %v (concurrent OK)", streamElapsed) } - diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 0c939a5c..a7050578 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -10,6 +10,8 @@ import ( "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" ) type Manager interface { @@ -44,11 +46,13 @@ type manager struct { limits ResourceLimits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks hostTopology *HostTopology // Cached host CPU topology + metrics *Metrics } -// NewManager creates a new instances manager -func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits) Manager { - return &manager{ +// NewManager creates a new instances manager. +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits, meter metric.Meter, tracer trace.Tracer) Manager { + m := &manager{ paths: p, imageManager: imageManager, systemManager: systemManager, @@ -58,6 +62,16 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste instanceLocks: sync.Map{}, hostTopology: detectHostTopology(), // Detect and cache host topology } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newInstanceMetrics(meter, tracer, m) + if err == nil { + m.metrics = metrics + } + } + + return m } // getInstanceLock returns or creates a lock for a specific instance @@ -81,7 +95,7 @@ func (m *manager) DeleteInstance(ctx context.Context, id string) error { lock := m.getInstanceLock(id) lock.Lock() defer lock.Unlock() - + err := m.deleteInstance(ctx, id) if err == nil { // Clean up the lock after successful deletion diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 4b533e0a..8534cdf4 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -26,21 +26,21 @@ import ( // setupTestManager creates a manager and registers cleanup for any orphaned processes func setupTestManager(t *testing.T) (*manager, string) { tmpDir := t.TempDir() - + cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } - + p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) - + systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = 
unlimited storage limits := ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -48,20 +48,20 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) - + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) + // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { cleanupOrphanedProcesses(t, mgr) }) - + return mgr, tmpDir } // waitForVMReady polls VM state via VMM API until it's running or times out func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { // Try to connect to VMM client, err := vmm.NewVMM(socketPath) @@ -70,48 +70,48 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio time.Sleep(100 * time.Millisecond) continue } - + // Get VM info infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(100 * time.Millisecond) continue } - + // Check if VM is running if infoResp.JSON200.State == vmm.Running { return nil } - + time.Sleep(100 * time.Millisecond) } - + return fmt.Errorf("VM did not reach running state within %v", timeout) } // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { logs, err := collectLogs(ctx, mgr, instanceID, 200) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if strings.Contains(logs, message) { return nil } - + time.Sleep(100 * 
time.Millisecond) } - + return fmt.Errorf("message %q not found in logs within %v", message, timeout) } @@ -121,12 +121,12 @@ func collectLogs(ctx context.Context, mgr *manager, instanceID string, n int) (s if err != nil { return "", err } - + var lines []string for line := range logChan { lines = append(lines, line) } - + return strings.Join(lines, "\n"), nil } @@ -137,26 +137,26 @@ func cleanupOrphanedProcesses(t *testing.T, mgr *manager) { if err != nil { return // No metadata files, nothing to clean } - + for _, metaFile := range metaFiles { // Extract instance ID from path id := filepath.Base(filepath.Dir(metaFile)) - + // Load metadata meta, err := mgr.loadMetadata(id) if err != nil { continue } - + // If metadata has a PID, try to kill it if meta.CHPID != nil { pid := *meta.CHPID - + // Check if process exists if err := syscall.Kill(pid, 0); err == nil { t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d (instance %s)", pid, id) syscall.Kill(pid, syscall.SIGKILL) - + // Wait for process to exit WaitForProcessExit(pid, 1*time.Second) } @@ -174,7 +174,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { ctx := context.Background() // Get the image manager from the manager (we need it for image operations) - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (runs a daemon, won't exit) @@ -210,7 +210,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create a volume to attach p := paths.New(tmpDir) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "test-data", @@ -226,12 +226,12 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create instance with real nginx image and attached volume req := CreateInstanceRequest{ - Name: 
"test-nginx", - Image: "docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, // 512MB - HotplugSize: 512 * 1024 * 1024, // 512MB - OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB - Vcpus: 1, + Name: "test-nginx", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, // 512MB + HotplugSize: 512 * 1024 * 1024, // 512MB + OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + Vcpus: 1, NetworkEnabled: false, // No network for tests Env: map[string]string{ "TEST_VAR": "test_value", @@ -299,33 +299,33 @@ func TestCreateAndDeleteInstance(t *testing.T) { for i := 0; i < 50; i++ { // Poll for up to 5 seconds (50 * 100ms) logs, err = collectLogs(ctx, manager, inst.Id, 100) require.NoError(t, err) - + if strings.Contains(logs, "start worker processes") { foundNginxStartup = true break } time.Sleep(100 * time.Millisecond) } - + t.Logf("Instance logs (last 100 lines):\n%s", logs) - + // Verify nginx started successfully assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 5 seconds") // Test volume is accessible from inside the guest via exec t.Log("Testing volume from inside guest via exec...") - + // Helper to run command in guest with retry (exec agent may need time between connections) runCmd := func(command ...string) (string, int, error) { var lastOutput string var lastExitCode int var lastErr error - + for attempt := 0; attempt < 5; attempt++ { if attempt > 0 { time.Sleep(200 * time.Millisecond) } - + var stdout, stderr bytes.Buffer exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ Command: command, @@ -333,31 +333,31 @@ func TestCreateAndDeleteInstance(t *testing.T) { Stderr: &stderr, TTY: false, }) - + // Combine stdout and stderr output := stdout.String() if stderr.Len() > 0 { output += stderr.String() } output = strings.TrimSpace(output) - + if err != nil { lastErr = err lastOutput = output lastExitCode = -1 continue } - + lastOutput = output lastExitCode = exit.Code lastErr = nil - + // Success 
if we got output or it's a command expected to have no output if output != "" || exit.Code == 0 { return output, exit.Code, nil } } - + return lastOutput, lastExitCode, lastErr } @@ -375,11 +375,11 @@ func TestCreateAndDeleteInstance(t *testing.T) { echo "=== Volume mount info ===" df -h /mnt/data `, testContent) - + output, exitCode, err := runCmd("sh", "-c", script) require.NoError(t, err, "Volume test script should execute") require.Equal(t, 0, exitCode, "Volume test script should succeed") - + // Verify all expected output is present require.Contains(t, output, "lost+found", "Volume should be ext4-formatted") require.Contains(t, output, testContent, "Should be able to read written content") @@ -455,7 +455,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Verify volume is gone _, err = volumeManager.GetVolume(ctx, vol.Id) assert.ErrorIs(t, err, volumes.ErrNotFound) - + t.Log("Instance and volume lifecycle test complete!") } @@ -464,17 +464,17 @@ func TestStorageOperations(t *testing.T) { tmpDir := t.TempDir() cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } p := paths.New(tmpDir) - imageManager, _ := images.NewManager(p, 1) + imageManager, _ := images.NewManager(p, 1, nil) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -482,7 +482,7 @@ func TestStorageOperations(t *testing.T) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - manager := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + manager := NewManager(p, 
imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Test metadata doesn't exist initially _, err := manager.loadMetadata("nonexistent") @@ -544,7 +544,7 @@ func TestStandbyAndRestore(t *testing.T) { ctx := context.Background() // Create image manager for pulling nginx - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (reuse if already pulled in previous test) @@ -577,14 +577,14 @@ func TestStandbyAndRestore(t *testing.T) { // Create instance t.Log("Creating instance...") req := CreateInstanceRequest{ - Name: "test-standby", - Image: "docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, - HotplugSize: 512 * 1024 * 1024, - OverlaySize: 10 * 1024 * 1024 * 1024, - Vcpus: 1, + Name: "test-standby", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, + HotplugSize: 512 * 1024 * 1024, + OverlaySize: 10 * 1024 * 1024 * 1024, + Vcpus: 1, NetworkEnabled: false, // No network for tests - Env: map[string]string{}, + Env: map[string]string{}, } inst, err := manager.CreateInstance(ctx, req) @@ -610,7 +610,7 @@ func TestStandbyAndRestore(t *testing.T) { assert.DirExists(t, snapshotDir) assert.FileExists(t, filepath.Join(snapshotDir, "memory-ranges")) // Cloud Hypervisor creates various snapshot files, just verify directory exists - + // DEBUG: Check snapshot files (for comparison with networking test) t.Log("DEBUG: Snapshot files for non-network instance:") entries, _ := os.ReadDir(snapshotDir) @@ -618,7 +618,7 @@ func TestStandbyAndRestore(t *testing.T) { info, _ := entry.Info() t.Logf(" - %s (size: %d bytes)", entry.Name(), info.Size()) } - + // DEBUG: Check console.log file size before restore consoleLogPath := filepath.Join(tmpDir, "guests", inst.Id, "logs", "console.log") var consoleLogSizeBefore int64 @@ -633,7 +633,7 @@ func TestStandbyAndRestore(t *testing.T) { require.NoError(t, err) 
assert.Equal(t, StateRunning, inst.State) t.Log("Instance restored and running") - + // DEBUG: Check console.log file size after restore if info, err := os.Stat(consoleLogPath); err == nil { consoleLogSizeAfter := info.Size() @@ -648,7 +648,7 @@ func TestStandbyAndRestore(t *testing.T) { t.Log("Cleaning up...") err = manager.DeleteInstance(ctx, inst.Id) require.NoError(t, err) - + t.Log("Standby/restore test complete!") } @@ -685,6 +685,4 @@ func TestStateTransitions(t *testing.T) { } } - // No mock image manager needed - tests use real images! - diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go new file mode 100644 index 00000000..3500aa1e --- /dev/null +++ b/lib/instances/metrics.go @@ -0,0 +1,118 @@ +package instances + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Metrics holds the metrics instruments for instance operations. +type Metrics struct { + createDuration metric.Float64Histogram + restoreDuration metric.Float64Histogram + standbyDuration metric.Float64Histogram + stateTransitions metric.Int64Counter + tracer trace.Tracer +} + +// newInstanceMetrics creates and registers all instance metrics. 
+func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for instance counts by state + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + instances, err := m.listInstances(ctx) + if err != nil { + return nil + } + stateCounts := make(map[string]int64) + for _, inst := range instances { + stateCounts[string(inst.State)]++ + } + for state, count := range stateCounts { + o.ObserveInt64(instancesTotal, count, + metric.WithAttributes(attribute.String("state", state))) + } + return nil + }, + instancesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + restoreDuration: restoreDuration, + standbyDuration: standbyDuration, + stateTransitions: stateTransitions, + tracer: tracer, + }, nil +} + +// recordDuration records operation 
duration. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + histogram.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordStateTransition records a state transition. +func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { + if m.metrics == nil { + return + } + m.metrics.stateTransitions.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", fromState), + attribute.String("to", toState), + )) +} diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index ac27b561..83930421 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -154,23 +154,23 @@ func createTestManager(t *testing.T, limits ResourceLimits) *manager { cfg := &config.Config{DataDir: tmpDir} p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemMgr := system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) - return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits).(*manager) + return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil).(*manager) } func TestResourceLimits_StructValues(t *testing.T) { limits := ResourceLimits{ - MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB MaxVcpusPerInstance: 4, - MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB + MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB MaxTotalVcpus: 16, - MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB + MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB } assert.Equal(t, 
int64(10*1024*1024*1024), limits.MaxOverlaySize) @@ -246,12 +246,12 @@ func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { } p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // Set small aggregate limits: // - MaxTotalVcpus: 2 (first VM gets 1, second wants 2 -> denied) @@ -264,7 +264,7 @@ func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { MaxTotalMemory: 2 * 1024 * 1024 * 1024, // aggregate: only 2GB total } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Cleanup any orphaned processes on test end t.Cleanup(func() { @@ -375,4 +375,3 @@ func cleanupTestProcesses(t *testing.T, mgr *manager) { } } } - diff --git a/lib/instances/restore.go b/lib/instances/restore.go index ffce3d36..53b7b3e9 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -8,18 +8,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // RestoreInstance restores an instance from standby // Multi-hop orchestration: Standby → Paused → Running func (m *manager) restoreInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "restoring instance from standby", "id", id) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "RestoreInstance") + defer span.End() + } + // 1. 
Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -108,6 +117,12 @@ func (m *manager) restoreInstance( log.WarnContext(ctx, "failed to update metadata after restore", "id", id, "error", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.restoreDuration, start, "success") + m.recordStateTransition(ctx, string(StateStandby), string(StateRunning)) + } + // Return instance with derived state (should be Running now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance restored successfully", "id", id, "state", finalInst.State) @@ -121,14 +136,14 @@ func (m *manager) restoreFromSnapshot( snapshotDir string, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process for restore", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -162,5 +177,3 @@ func (m *manager) restoreFromSnapshot( log.DebugContext(ctx, "VM restored from snapshot successfully", "id", stored.Id) return nil } - - diff --git a/lib/instances/standby.go b/lib/instances/standby.go index ad745164..68e67661 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -9,18 +9,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // StandbyInstance puts an instance in standby state // Multi-hop orchestration: Running → Paused → Standby func (m *manager) standbyInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "putting instance in standby", "id", id) - + + // Start tracing span if tracer is available 
+ if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "StandbyInstance") + defer span.End() + } + // 1. Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -110,6 +119,12 @@ func (m *manager) standbyInstance( return nil, fmt.Errorf("save metadata: %w", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.standbyDuration, start, "success") + m.recordStateTransition(ctx, string(StateRunning), string(StateStandby)) + } + // Return instance with derived state (should be Standby now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance put in standby successfully", "id", id, "state", finalInst.State) @@ -131,34 +146,34 @@ func reduceMemory(ctx context.Context, client *vmm.VMM, targetBytes int64) error func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeout time.Duration) error { log := logger.FromContext(ctx) deadline := time.Now().Add(timeout) - + // Use 20ms for fast response with minimal overhead const pollInterval = 20 * time.Millisecond const stabilityThreshold = 3 // Memory unchanged for 3 checks = stable - + var previousSize *int64 unchangedCount := 0 - + for time.Now().Before(deadline) { infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(pollInterval) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(pollInterval) continue } - + actualSize := infoResp.JSON200.MemoryActualSize if actualSize == nil { time.Sleep(pollInterval) continue } - + currentSize := *actualSize - + // Best case: reached target or below if currentSize <= targetBytes { log.DebugContext(ctx, "memory reduced to target", @@ -166,7 +181,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo "actual_mb", currentSize/(1024*1024)) return nil } - + // Check if memory has stopped shrinking (stabilized above target) if previousSize != nil { if currentSize == 
*previousSize { @@ -185,11 +200,11 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo unchangedCount = 0 } } - + previousSize = &currentSize time.Sleep(pollInterval) } - + // Timeout - memory never stabilized return fmt.Errorf("memory reduction did not complete within %v", timeout) } @@ -197,7 +212,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo // createSnapshot creates a Cloud Hypervisor snapshot func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) error { log := logger.FromContext(ctx) - + // Remove old snapshot os.RemoveAll(snapshotDir) @@ -227,7 +242,7 @@ func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) er // shutdownVMM gracefully shuts down the VMM process via API func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log := logger.FromContext(ctx) - + // Try to connect to VMM client, err := vmm.NewVMM(inst.SocketPath) if err != nil { @@ -239,7 +254,7 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { // Try graceful shutdown log.DebugContext(ctx, "sending shutdown command to VMM", "id", inst.Id) client.ShutdownVMMWithResponse(ctx) - + // Wait for process to exit if inst.CHPID != nil { if !WaitForProcessExit(*inst.CHPID, 2*time.Second) { @@ -248,7 +263,6 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log.DebugContext(ctx, "VMM shutdown gracefully", "id", inst.Id, "pid", *inst.CHPID) } } - + return nil } - diff --git a/lib/instances/volumes_test.go b/lib/instances/volumes_test.go index c8a36816..5e46d007 100644 --- a/lib/instances/volumes_test.go +++ b/lib/instances/volumes_test.go @@ -67,7 +67,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") 
@@ -92,7 +92,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { t.Log("System files ready") // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "shared-data", @@ -250,7 +250,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -272,7 +272,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { require.NoError(t, err) // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "cleanup-test-vol", SizeGb: 1, @@ -359,7 +359,7 @@ func TestVolumeFromArchive(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -393,7 +393,7 @@ func TestVolumeFromArchive(t *testing.T) { archive := createTestTarGz(t, testFiles) // Create volume from archive - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume from archive...") vol, err := volumeManager.CreateVolumeFromArchive(ctx, volumes.CreateVolumeFromArchiveRequest{ Name: "archive-data", diff --git a/lib/logger/README.md b/lib/logger/README.md new file mode 100644 index 00000000..4996bfeb --- /dev/null +++ b/lib/logger/README.md @@ -0,0 +1,55 @@ +# Logger + +Structured logging with per-subsystem log levels and OpenTelemetry trace context integration. 
+ +## Features + +- Per-subsystem configurable log levels +- Automatic trace_id/span_id injection when OTel is active +- Context-based logger propagation +- JSON output format + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `LOG_LEVEL` | Default log level | `info` | +| `LOG_LEVEL_` | Per-subsystem level | inherits default | + +Subsystems: `API`, `IMAGES`, `INSTANCES`, `NETWORK`, `VOLUMES`, `VMM`, `SYSTEM`, `EXEC` + +Example: +```bash +LOG_LEVEL=info LOG_LEVEL_NETWORK=debug ./hypeman +``` + +## Usage + +```go +// Create subsystem-specific logger +cfg := logger.NewConfig() +log := logger.NewSubsystemLogger(logger.SubsystemInstances, cfg) + +// Add logger to context +ctx = logger.AddToContext(ctx, log) + +// Retrieve from context +log = logger.FromContext(ctx) +log.InfoContext(ctx, "instance created", "id", instanceID) +``` + +## Output + +When OTel tracing is active, logs include trace context: + +```json +{ + "level": "INFO", + "msg": "instance created", + "subsystem": "INSTANCES", + "trace_id": "abc123...", + "span_id": "def456...", + "id": "instance-123" +} +``` + diff --git a/lib/logger/logger.go b/lib/logger/logger.go index b4ac1528..6d60fcde 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -1,20 +1,231 @@ +// Package logger provides structured logging with subsystem-specific levels +// and OpenTelemetry trace context integration. package logger import ( "context" "log/slog" + "os" + "strings" + + "go.opentelemetry.io/otel/trace" ) type contextKey string const loggerKey contextKey = "logger" -// AddToContext adds a logger to the context +// Subsystem names for per-subsystem logging configuration. +const ( + SubsystemAPI = "API" + SubsystemImages = "IMAGES" + SubsystemInstances = "INSTANCES" + SubsystemNetwork = "NETWORK" + SubsystemVolumes = "VOLUMES" + SubsystemVMM = "VMM" + SubsystemSystem = "SYSTEM" + SubsystemExec = "EXEC" +) + +// Config holds logging configuration. 
+type Config struct { + // DefaultLevel is the default log level for all subsystems. + DefaultLevel slog.Level + // SubsystemLevels maps subsystem names to their specific log levels. + // If a subsystem is not in this map, DefaultLevel is used. + SubsystemLevels map[string]slog.Level + // AddSource adds source file information to log entries. + AddSource bool +} + +// NewConfig creates a Config from environment variables. +// Reads LOG_LEVEL for default level and LOG_LEVEL_ for per-subsystem levels. +func NewConfig() Config { + cfg := Config{ + DefaultLevel: slog.LevelInfo, + SubsystemLevels: make(map[string]slog.Level), + AddSource: false, + } + + // Parse default level + if levelStr := os.Getenv("LOG_LEVEL"); levelStr != "" { + cfg.DefaultLevel = parseLevel(levelStr) + } + + // Parse subsystem-specific levels + subsystems := []string{ + SubsystemAPI, SubsystemImages, SubsystemInstances, + SubsystemNetwork, SubsystemVolumes, SubsystemVMM, + SubsystemSystem, SubsystemExec, + } + for _, subsystem := range subsystems { + envKey := "LOG_LEVEL_" + subsystem + if levelStr := os.Getenv(envKey); levelStr != "" { + cfg.SubsystemLevels[subsystem] = parseLevel(levelStr) + } + } + + return cfg +} + +// parseLevel parses a log level string. +func parseLevel(s string) slog.Level { + switch strings.ToLower(s) { + case "debug": + return slog.LevelDebug + case "info": + return slog.LevelInfo + case "warn", "warning": + return slog.LevelWarn + case "error": + return slog.LevelError + default: + return slog.LevelInfo + } +} + +// LevelFor returns the log level for the given subsystem. +func (c Config) LevelFor(subsystem string) slog.Level { + if level, ok := c.SubsystemLevels[subsystem]; ok { + return level + } + return c.DefaultLevel +} + +// NewLogger creates a new slog.Logger with JSON output. 
+func NewLogger(cfg Config) *slog.Logger { + return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: cfg.DefaultLevel, + AddSource: cfg.AddSource, + })) +} + +// NewSubsystemLogger creates a logger for a specific subsystem with its configured level. +// If otelHandler is provided, logs will be sent both to stdout and to OTel. +func NewSubsystemLogger(subsystem string, cfg Config, otelHandler slog.Handler) *slog.Logger { + level := cfg.LevelFor(subsystem) + jsonHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: level, + AddSource: cfg.AddSource, + }) + + var baseHandler slog.Handler + if otelHandler != nil { + // Use multi-handler to write to both stdout and OTel + baseHandler = &multiHandler{ + handlers: []slog.Handler{jsonHandler, otelHandler}, + } + } else { + baseHandler = jsonHandler + } + + // Wrap with trace context handler for trace IDs in logs + wrappedHandler := &traceContextHandler{ + Handler: baseHandler, + subsystem: subsystem, + level: level, + } + return slog.New(wrappedHandler) +} + +// traceContextHandler wraps a slog.Handler to add trace context and subsystem. +type traceContextHandler struct { + slog.Handler + subsystem string + level slog.Level +} + +// Enabled reports whether the handler handles records at the given level. +func (h *traceContextHandler) Enabled(ctx context.Context, level slog.Level) bool { + return level >= h.level +} + +// Handle adds trace_id and span_id from the context if available. +func (h *traceContextHandler) Handle(ctx context.Context, r slog.Record) error { + // Add subsystem attribute + r.AddAttrs(slog.String("subsystem", h.subsystem)) + + // Add trace context if available + spanCtx := trace.SpanContextFromContext(ctx) + if spanCtx.IsValid() { + r.AddAttrs( + slog.String("trace_id", spanCtx.TraceID().String()), + slog.String("span_id", spanCtx.SpanID().String()), + ) + } + + return h.Handler.Handle(ctx, r) +} + +// WithAttrs returns a new handler with the given attributes. 
+func (h *traceContextHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithAttrs(attrs), + subsystem: h.subsystem, + level: h.level, + } +} + +// WithGroup returns a new handler with the given group name. +func (h *traceContextHandler) WithGroup(name string) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithGroup(name), + subsystem: h.subsystem, + level: h.level, + } +} + +// multiHandler fans out log records to multiple handlers. +type multiHandler struct { + handlers []slog.Handler +} + +// Enabled returns true if any handler is enabled. +func (m *multiHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, h := range m.handlers { + if h.Enabled(ctx, level) { + return true + } + } + return false +} + +// Handle writes the record to all handlers. +func (m *multiHandler) Handle(ctx context.Context, r slog.Record) error { + for _, h := range m.handlers { + if h.Enabled(ctx, r.Level) { + if err := h.Handle(ctx, r); err != nil { + return err + } + } + } + return nil +} + +// WithAttrs returns a new multiHandler with the given attributes. +func (m *multiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithAttrs(attrs) + } + return &multiHandler{handlers: handlers} +} + +// WithGroup returns a new multiHandler with the given group name. +func (m *multiHandler) WithGroup(name string) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithGroup(name) + } + return &multiHandler{handlers: handlers} +} + +// AddToContext adds a logger to the context. 
func AddToContext(ctx context.Context, logger *slog.Logger) context.Context { return context.WithValue(ctx, loggerKey, logger) } -// FromContext retrieves the logger from context, or returns default +// FromContext retrieves the logger from context, or returns default. func FromContext(ctx context.Context) *slog.Logger { if logger, ok := ctx.Value(loggerKey).(*slog.Logger); ok { return logger @@ -22,3 +233,7 @@ func FromContext(ctx context.Context) *slog.Logger { return slog.Default() } +// With returns a logger with additional attributes. +func With(logger *slog.Logger, args ...any) *slog.Logger { + return logger.With(args...) +} diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go new file mode 100644 index 00000000..cac161c9 --- /dev/null +++ b/lib/middleware/otel.go @@ -0,0 +1,202 @@ +package middleware + +import ( + "bufio" + "fmt" + "log/slog" + "net" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/onkernel/hypeman/lib/logger" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// HTTPMetrics holds the OTel metrics for HTTP requests. +type HTTPMetrics struct { + requestsTotal metric.Int64Counter + requestDuration metric.Float64Histogram +} + +// NewHTTPMetrics creates new HTTP metrics instruments. +func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { + requestsTotal, err := meter.Int64Counter( + "hypeman_http_requests_total", + metric.WithDescription("Total number of HTTP requests"), + ) + if err != nil { + return nil, err + } + + requestDuration, err := meter.Float64Histogram( + "hypeman_http_request_duration_seconds", + metric.WithDescription("HTTP request duration in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &HTTPMetrics{ + requestsTotal: requestsTotal, + requestDuration: requestDuration, + }, nil +} + +// Middleware returns an HTTP middleware that records metrics. 
+func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code and bytes + wrapped := wrapResponseWriter(w) + + // Process request + next.ServeHTTP(wrapped, r) + + // Calculate duration + duration := time.Since(start).Seconds() + + // Get route pattern if available (chi specific) + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Record metrics + attrs := []attribute.KeyValue{ + attribute.String("method", r.Method), + attribute.String("path", routePattern), + attribute.Int("status", wrapped.Status()), + } + + m.requestsTotal.Add(r.Context(), 1, metric.WithAttributes(attrs...)) + m.requestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...)) + }) +} + +// NoopHTTPMetrics returns a middleware that does nothing (for when OTel is disabled). +func NoopHTTPMetrics() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return next + } +} + +// AccessLogger returns a middleware that logs HTTP requests using slog with trace context. +// This replaces chi's middleware.Logger to get logs into OTel/Loki with trace correlation. 
+func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code and bytes + wrapped := wrapResponseWriter(w) + + // Process request + next.ServeHTTP(wrapped, r) + + // Get route pattern + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Log with trace context from request context + duration := time.Since(start) + log.InfoContext(r.Context(), + fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.Status(), wrapped.BytesWritten(), duration.Milliseconds()), + "method", r.Method, + "path", routePattern, + "status", wrapped.Status(), + "bytes", wrapped.BytesWritten(), + "duration_ms", duration.Milliseconds(), + "remote_addr", r.RemoteAddr, + ) + }) + } +} + +// NewAccessLogger creates an access logger with OTel handler if available. +func NewAccessLogger(otelHandler slog.Handler) *slog.Logger { + cfg := logger.NewConfig() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) +} + +// InjectLogger returns middleware that adds the logger to the request context. +// This enables handlers to use logger.FromContext(ctx) with trace correlation. +func InjectLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := logger.AddToContext(r.Context(), log) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } +} + +// responseWriter wraps http.ResponseWriter to capture status code and bytes written. +// It also implements http.Flusher and http.Hijacker when the underlying writer supports them. 
+type responseWriter struct { + http.ResponseWriter + statusCode int + bytesWritten int + wroteHeader bool +} + +// wrapResponseWriter creates a new responseWriter wrapper. +func wrapResponseWriter(w http.ResponseWriter) *responseWriter { + return &responseWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } +} + +// Status returns the HTTP status code. +func (rw *responseWriter) Status() int { + return rw.statusCode +} + +// BytesWritten returns the number of bytes written. +func (rw *responseWriter) BytesWritten() int { + return rw.bytesWritten +} + +// WriteHeader captures the status code before calling the underlying WriteHeader. +func (rw *responseWriter) WriteHeader(code int) { + if !rw.wroteHeader { + rw.statusCode = code + rw.wroteHeader = true + rw.ResponseWriter.WriteHeader(code) + } +} + +// Write captures bytes written and calls the underlying Write. +func (rw *responseWriter) Write(b []byte) (int, error) { + if !rw.wroteHeader { + rw.WriteHeader(http.StatusOK) + } + n, err := rw.ResponseWriter.Write(b) + rw.bytesWritten += n + return n, err +} + +// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. +func (rw *responseWriter) Unwrap() http.ResponseWriter { + return rw.ResponseWriter +} + +// Flush implements http.Flusher. It delegates to the underlying writer if supported. +func (rw *responseWriter) Flush() { + if f, ok := rw.ResponseWriter.(http.Flusher); ok { + f.Flush() + } +} + +// Hijack implements http.Hijacker. It delegates to the underlying writer if supported. 
+func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if h, ok := rw.ResponseWriter.(http.Hijacker); ok { + return h.Hijack() + } + return nil, nil, fmt.Errorf("underlying ResponseWriter does not implement http.Hijacker") +} diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 5396627d..b3f35900 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -59,6 +59,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N if err := m.createTAPDevice(tap, network.Bridge, network.Isolated); err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "allocated network", "instance_id", req.InstanceID, @@ -111,6 +112,7 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string) err if err := m.createTAPDevice(alloc.TAPDevice, network.Bridge, network.Isolated); err != nil { return fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "recreated network for restore", "instance_id", instanceID, @@ -138,6 +140,8 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro // 1. 
Delete TAP device (best effort) if err := m.deleteTAPDevice(alloc.TAPDevice); err != nil { log.WarnContext(ctx, "failed to delete TAP device", "tap", alloc.TAPDevice, "error", err) + } else { + m.recordTAPOperation(ctx, "delete") } log.InfoContext(ctx, "released network", @@ -270,4 +274,3 @@ func generateTAPName(instanceID string) string { } return TAPPrefix + strings.ToLower(shortID) } - diff --git a/lib/network/manager.go b/lib/network/manager.go index cb43af07..a48d3341 100644 --- a/lib/network/manager.go +++ b/lib/network/manager.go @@ -9,6 +9,7 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/metric" ) // Manager defines the interface for network management @@ -29,17 +30,29 @@ type Manager interface { // manager implements the Manager interface type manager struct { - paths *paths.Paths - config *config.Config - mu sync.Mutex // Protects network allocation operations (IP allocation) + paths *paths.Paths + config *config.Config + mu sync.Mutex // Protects network allocation operations (IP allocation) + metrics *Metrics } -// NewManager creates a new network manager -func NewManager(p *paths.Paths, cfg *config.Config) Manager { - return &manager{ +// NewManager creates a new network manager. +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, cfg *config.Config, meter metric.Meter) Manager { + m := &manager{ paths: p, config: cfg, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newNetworkMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m } // Initialize initializes the network manager and creates default network. 
@@ -100,4 +113,3 @@ func (m *manager) getDefaultNetwork(ctx context.Context) (*Network, error) { CreatedAt: time.Time{}, // Unknown for default }, nil } - diff --git a/lib/network/metrics.go b/lib/network/metrics.go new file mode 100644 index 00000000..48cbd1c0 --- /dev/null +++ b/lib/network/metrics.go @@ -0,0 +1,61 @@ +package network + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for network operations. +type Metrics struct { + tapOperations metric.Int64Counter +} + +// newNetworkMetrics creates and registers all network metrics. +func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for allocations + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + allocs, err := m.ListAllocations(ctx) + if err != nil { + return nil + } + o.ObserveInt64(allocationsTotal, int64(len(allocs))) + return nil + }, + allocationsTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + tapOperations: tapOperations, + }, nil +} + +// recordTAPOperation records a TAP device operation. 
+func (m *manager) recordTAPOperation(ctx context.Context, operation string) { + if m.metrics == nil { + return + } + m.metrics.tapOperations.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) +} diff --git a/lib/otel/README.md b/lib/otel/README.md new file mode 100644 index 00000000..6db7a686 --- /dev/null +++ b/lib/otel/README.md @@ -0,0 +1,104 @@ +# OpenTelemetry + +Provides OpenTelemetry initialization and metric definitions for Hypeman. + +## Features + +- OTLP export for traces, metrics, and logs (gRPC) +- Runtime metrics (Go GC, goroutines, memory) +- Application-specific metrics per subsystem +- Log bridging from slog to OTel (viewable in Grafana/Loki) +- Graceful degradation (failures don't crash the app) + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `ENV` | Deployment environment (`deployment.environment` attribute) | `unset` | +| `OTEL_ENABLED` | Enable OpenTelemetry | `false` | +| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `127.0.0.1:4317` | +| `OTEL_SERVICE_NAME` | Service name | `hypeman` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | +| `OTEL_INSECURE` | Disable TLS for OTLP | `true` | + +## Metrics + +### System +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_uptime_seconds` | gauge | Process uptime | +| `hypeman_info` | gauge | Build info (version, go_version labels) | + +### HTTP +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_http_requests_total` | counter | method, path, status | Total HTTP requests | +| `hypeman_http_request_duration_seconds` | histogram | method, path, status | Request latency | + +### Images +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_images_build_queue_length` | gauge | | Current build queue size | +| `hypeman_images_build_duration_seconds` | histogram | status | Image 
build time | +| `hypeman_images_total` | gauge | status | Cached images count | +| `hypeman_images_pulls_total` | counter | status | Registry pulls | + +### Instances +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_instances_total` | gauge | state | Instances by state | +| `hypeman_instances_create_duration_seconds` | histogram | status | Create time | +| `hypeman_instances_restore_duration_seconds` | histogram | status | Restore time | +| `hypeman_instances_standby_duration_seconds` | histogram | status | Standby time | +| `hypeman_instances_state_transitions_total` | counter | from, to | State transitions | + +### Network +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_network_allocations_total` | gauge | | Active IP allocations | +| `hypeman_network_tap_operations_total` | counter | operation | TAP create/delete ops | + +### Volumes +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_volumes_total` | gauge | Volume count | +| `hypeman_volumes_allocated_bytes` | gauge | Total provisioned size | +| `hypeman_volumes_used_bytes` | gauge | Actual disk space consumed | +| `hypeman_volumes_create_duration_seconds` | histogram | Creation time | + +### VMM +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_vmm_api_duration_seconds` | histogram | operation, status | CH API latency | +| `hypeman_vmm_api_errors_total` | counter | operation | CH API errors | + +### Exec +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_exec_sessions_total` | counter | status, exit_code | Exec sessions | +| `hypeman_exec_duration_seconds` | histogram | status | Command duration | +| `hypeman_exec_bytes_sent_total` | counter | | Bytes to guest (stdin) | +| `hypeman_exec_bytes_received_total` | counter | | Bytes from guest (stdout+stderr) | + +## Usage + +```go +provider, shutdown, err 
:= otel.Init(ctx, otel.Config{ + Enabled: true, + Endpoint: "localhost:4317", + ServiceName: "hypeman", +}) +defer shutdown(ctx) + +meter := provider.Meter // Use for creating metrics +tracer := provider.Tracer // Use for creating traces +logHandler := provider.LogHandler // Use with slog for logs to OTel +``` + +## Logs + +Logs are exported via the OTel log bridge (`otelslog`). When OTel is enabled, all slog logs are sent to Loki (via OTLP) and include: +- `subsystem` attribute (API, IMAGES, INSTANCES, etc.) +- `trace_id` and `span_id` when available +- Service attributes (name, instance, environment) + diff --git a/lib/otel/otel.go b/lib/otel/otel.go new file mode 100644 index 00000000..2b1c2e41 --- /dev/null +++ b/lib/otel/otel.go @@ -0,0 +1,265 @@ +// Package otel provides OpenTelemetry initialization and configuration. +package otel + +import ( + "context" + "fmt" + "log/slog" + goruntime "runtime" + "time" + + "go.opentelemetry.io/contrib/bridges/otelslog" + otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + sdklog "go.opentelemetry.io/otel/sdk/log" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.37.0" + "go.opentelemetry.io/otel/trace" +) + +// Config holds OpenTelemetry configuration. +type Config struct { + Enabled bool + Endpoint string + ServiceName string + ServiceInstanceID string + Insecure bool + Version string + Env string +} + +// Provider holds initialized OTel providers. 
+type Provider struct { + TracerProvider *sdktrace.TracerProvider + MeterProvider *sdkmetric.MeterProvider + LoggerProvider *sdklog.LoggerProvider + Tracer trace.Tracer + Meter metric.Meter + LogHandler slog.Handler + startTime time.Time +} + +// Init initializes OpenTelemetry with the given configuration. +// Returns a shutdown function that should be called on application exit. +// If OTel is disabled, returns a no-op shutdown function. +func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) error, error) { + if !cfg.Enabled { + // Return no-op provider when disabled + return &Provider{ + Tracer: otel.Tracer(cfg.ServiceName), + Meter: otel.Meter(cfg.ServiceName), + startTime: time.Now(), + }, func(context.Context) error { return nil }, nil + } + + // Create resource with service information + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.Version), + semconv.ServiceInstanceID(cfg.ServiceInstanceID), + semconv.DeploymentEnvironmentName(cfg.Env), + ), + ) + if err != nil { + return nil, nil, fmt.Errorf("create resource: %w", err) + } + + // Create trace exporter + traceOpts := []otlptracegrpc.Option{ + otlptracegrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + traceOpts = append(traceOpts, otlptracegrpc.WithInsecure()) + } + traceExporter, err := otlptracegrpc.New(ctx, traceOpts...) + if err != nil { + return nil, nil, fmt.Errorf("create trace exporter: %w", err) + } + + // Create tracer provider + tracerProvider := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(traceExporter), + sdktrace.WithResource(res), + ) + + // Create metric exporter + metricOpts := []otlpmetricgrpc.Option{ + otlpmetricgrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + metricOpts = append(metricOpts, otlpmetricgrpc.WithInsecure()) + } + metricExporter, err := otlpmetricgrpc.New(ctx, metricOpts...) 
+ if err != nil { + tracerProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create metric exporter: %w", err) + } + + // Create meter provider + meterProvider := sdkmetric.NewMeterProvider( + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExporter)), + sdkmetric.WithResource(res), + ) + + // Create log exporter + logOpts := []otlploggrpc.Option{ + otlploggrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + logOpts = append(logOpts, otlploggrpc.WithInsecure()) + } + logExporter, err := otlploggrpc.New(ctx, logOpts...) + if err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create log exporter: %w", err) + } + + // Create logger provider + loggerProvider := sdklog.NewLoggerProvider( + sdklog.WithProcessor(sdklog.NewBatchProcessor(logExporter)), + sdklog.WithResource(res), + ) + + // Set global providers + otel.SetTracerProvider(tracerProvider) + otel.SetMeterProvider(meterProvider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + // Start runtime metrics collection + if err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + loggerProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("start runtime metrics: %w", err) + } + + // Create slog handler that bridges to OTel + logHandler := otelslog.NewHandler(cfg.ServiceName, otelslog.WithLoggerProvider(loggerProvider)) + + provider := &Provider{ + TracerProvider: tracerProvider, + MeterProvider: meterProvider, + LoggerProvider: loggerProvider, + Tracer: tracerProvider.Tracer(cfg.ServiceName), + Meter: meterProvider.Meter(cfg.ServiceName), + LogHandler: logHandler, + startTime: time.Now(), + } + + // Register system metrics (uptime, info) + if err := provider.registerSystemMetrics(cfg); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + 
 loggerProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("register system metrics: %w", err) + } + + shutdown := func(ctx context.Context) error { + var errs []error + if err := tracerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown tracer: %w", err)) + } + if err := meterProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) + } + if err := loggerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown logger: %w", err)) + } + if len(errs) > 0 { + return fmt.Errorf("shutdown errors: %v", errs) + } + return nil + } + + return provider, shutdown, nil +} + +// registerSystemMetrics registers uptime and info metrics. +func (p *Provider) registerSystemMetrics(cfg Config) error { + // Uptime gauge + uptime, err := p.Meter.Float64ObservableGauge( + "hypeman_uptime_seconds", + metric.WithDescription("Process uptime in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return fmt.Errorf("create uptime gauge: %w", err) + } + + // Info gauge (always 1, with version labels) + info, err := p.Meter.Int64ObservableGauge( + "hypeman_info", + metric.WithDescription("Hypeman build information"), + ) + if err != nil { + return fmt.Errorf("create info gauge: %w", err) + } + + _, err = p.Meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + o.ObserveFloat64(uptime, time.Since(p.startTime).Seconds()) + o.ObserveInt64(info, 1, + metric.WithAttributes( + semconv.ServiceVersion(cfg.Version), + semconv.TelemetrySDKLanguageGo, + ), + ) + return nil + }, + uptime, + info, + ) + if err != nil { + return fmt.Errorf("register callback: %w", err) + } + + return nil +} + +// TracerFor returns a tracer for the given subsystem. +func (p *Provider) TracerFor(subsystem string) trace.Tracer { + if p.TracerProvider != nil { + return p.TracerProvider.Tracer(subsystem) + } + return otel.Tracer(subsystem) +} + +// MeterFor returns a meter for the given subsystem. 
+func (p *Provider) MeterFor(subsystem string) metric.Meter { + if p.MeterProvider != nil { + return p.MeterProvider.Meter(subsystem) + } + return otel.Meter(subsystem) +} + +// GoVersion returns the Go version used to build the binary. +func GoVersion() string { + return goruntime.Version() +} + +// globalLogHandler holds the OTel log handler for use by the logger package. +var globalLogHandler slog.Handler + +// SetGlobalLogHandler sets the global OTel log handler. +func SetGlobalLogHandler(h slog.Handler) { + globalLogHandler = h +} + +// GetGlobalLogHandler returns the global OTel log handler. +func GetGlobalLogHandler() slog.Handler { + return globalLogHandler +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 2daa0749..010ee3e6 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log/slog" - "os" "github.com/c2h5oh/datasize" "github.com/onkernel/hypeman/cmd/api/config" @@ -12,16 +11,18 @@ import ( "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" + hypemanotel "github.com/onkernel/hypeman/lib/otel" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel" ) -// ProvideLogger provides a structured logger +// ProvideLogger provides a structured logger with subsystem-specific levels func ProvideLogger() *slog.Logger { - return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ - Level: slog.LevelInfo, - })) + cfg := logger.NewConfig() + otelHandler := hypemanotel.GetGlobalLogHandler() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) } // ProvideContext provides a context with logger attached @@ -41,7 +42,8 @@ func ProvidePaths(cfg *config.Config) *paths.Paths { // ProvideImageManager provides the image manager func ProvideImageManager(p *paths.Paths, cfg *config.Config) 
(images.Manager, error) { - return images.NewManager(p, cfg.MaxConcurrentBuilds) + meter := otel.GetMeterProvider().Meter("hypeman") + return images.NewManager(p, cfg.MaxConcurrentBuilds, meter) } // ProvideSystemManager provides the system manager @@ -51,7 +53,8 @@ func ProvideSystemManager(p *paths.Paths) system.Manager { // ProvideNetworkManager provides the network manager func ProvideNetworkManager(p *paths.Paths, cfg *config.Config) network.Manager { - return network.NewManager(p, cfg) + meter := otel.GetMeterProvider().Meter("hypeman") + return network.NewManager(p, cfg, meter) } // ProvideInstanceManager provides the instance manager @@ -90,7 +93,9 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima MaxTotalMemory: maxTotalMemory, } - return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits), nil + meter := otel.GetMeterProvider().Meter("hypeman") + tracer := otel.GetTracerProvider().Tracer("hypeman") + return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, meter, tracer), nil } // ProvideVolumeManager provides the volume manager @@ -105,5 +110,6 @@ func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, maxTotalVolumeStorage = int64(storageSize) } - return volumes.NewManager(p, maxTotalVolumeStorage), nil + meter := otel.GetMeterProvider().Meter("hypeman") + return volumes.NewManager(p, maxTotalVolumeStorage, meter), nil } diff --git a/lib/vmm/client.go b/lib/vmm/client.go index c70a6572..7fc7a906 100644 --- a/lib/vmm/client.go +++ b/lib/vmm/client.go @@ -12,6 +12,8 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // VMM wraps the generated Cloud Hypervisor client (API v0.3.0) @@ -20,15 +22,45 @@ type VMM struct { socketPath string } +// metricsRoundTripper wraps an http.RoundTripper to record metrics +type metricsRoundTripper 
struct { + base http.RoundTripper +} + +func (m *metricsRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + start := time.Now() + resp, err := m.base.RoundTrip(req) + + // Record metrics using global VMMMetrics + if VMMMetrics != nil { + operation := req.Method + " " + req.URL.Path + status := "success" + if err != nil || (resp != nil && resp.StatusCode >= 400) { + status = "error" + VMMMetrics.APIErrorsTotal.Add(req.Context(), 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + VMMMetrics.APIDuration.Record(req.Context(), time.Since(start).Seconds(), + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) + } + + return resp, err +} + // NewVMM creates a Cloud Hypervisor client for an existing VMM socket func NewVMM(socketPath string) (*VMM, error) { - httpClient := &http.Client{ - Transport: &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) - }, + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) }, - Timeout: 30 * time.Second, + } + + httpClient := &http.Client{ + Transport: &metricsRoundTripper{base: transport}, + Timeout: 30 * time.Second, } client, err := NewClientWithResponses("http://localhost/api/v1", @@ -71,15 +103,15 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // Build command arguments args := []string{"--api-socket", socketPath} args = append(args, extraArgs...) - + // Use Command (not CommandContext) so process survives parent context cancellation cmd := exec.Command(binaryPath, args...) 
- + // Daemonize: detach from parent process group cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, // Create new process group } - + // Redirect stdout/stderr to log files (process won't block on I/O) instanceDir := filepath.Dir(socketPath) stdoutFile, err := os.OpenFile( @@ -94,7 +126,7 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // The child process receives duplicated file descriptors during fork/exec, // so it can continue writing to the log files even after we close them here. defer stdoutFile.Close() - + stderrFile, err := os.OpenFile( filepath.Join(instanceDir, "ch-stderr.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, @@ -104,24 +136,24 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion return 0, fmt.Errorf("create stderr log: %w", err) } defer stderrFile.Close() - + cmd.Stdout = stdoutFile cmd.Stderr = stderrFile if err := cmd.Start(); err != nil { return 0, fmt.Errorf("start cloud-hypervisor: %w", err) } - + pid := cmd.Process.Pid // Wait for socket to be ready (use fresh context with timeout, not parent context) waitCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() - + if err := waitForSocket(waitCtx, socketPath, 5*time.Second); err != nil { return 0, err } - + return pid, nil } diff --git a/lib/vmm/metrics.go b/lib/vmm/metrics.go new file mode 100644 index 00000000..19b514cf --- /dev/null +++ b/lib/vmm/metrics.go @@ -0,0 +1,75 @@ +package vmm + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for VMM operations. +type Metrics struct { + APIDuration metric.Float64Histogram + APIErrorsTotal metric.Int64Counter +} + +// VMMMetrics is the global metrics instance for the vmm package. +// Set this via SetMetrics() during application initialization. +var VMMMetrics *Metrics + +// SetMetrics sets the global metrics instance. 
+func SetMetrics(m *Metrics) { + VMMMetrics = m +} + +// NewMetrics creates VMM metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + apiDuration, err := meter.Float64Histogram( + "hypeman_vmm_api_duration_seconds", + metric.WithDescription("Cloud Hypervisor API call duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + apiErrorsTotal, err := meter.Int64Counter( + "hypeman_vmm_api_errors_total", + metric.WithDescription("Total number of Cloud Hypervisor API errors"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + APIDuration: apiDuration, + APIErrorsTotal: apiErrorsTotal, + }, nil +} + +// RecordAPICall records the duration and status of an API call. +func (m *Metrics) RecordAPICall(ctx context.Context, operation string, start time.Time, err error) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if err != nil { + status = "error" + m.APIErrorsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + + m.APIDuration.Record(ctx, duration, + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) +} diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index 2292e98c..d3a3dfc8 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -11,6 +11,7 @@ import ( "github.com/nrednav/cuid2" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/metric" ) // Manager provides volume lifecycle operations @@ -38,16 +39,28 @@ type manager struct { paths *paths.Paths maxTotalVolumeStorage int64 // Maximum total volume storage in bytes (0 = unlimited) volumeLocks sync.Map // map[string]*sync.RWMutex - per-volume locks + metrics *Metrics } -// NewManager creates a new volumes manager -// 
maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited) -func NewManager(p *paths.Paths, maxTotalVolumeStorage int64) Manager { - return &manager{ +// NewManager creates a new volumes manager. +// maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited). +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, maxTotalVolumeStorage int64, meter metric.Meter) Manager { + m := &manager{ paths: p, maxTotalVolumeStorage: maxTotalVolumeStorage, volumeLocks: sync.Map{}, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newVolumeMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m } // getVolumeLock returns or creates a lock for a specific volume @@ -92,6 +105,8 @@ func (m *manager) calculateTotalVolumeStorage(ctx context.Context) (int64, error // CreateVolume creates a new volume func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -145,12 +160,15 @@ func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*V return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } // CreateVolumeFromArchive creates a new volume pre-populated with content from a tar.gz archive. // The archive is safely extracted with size limits to prevent tar bombs. 
func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeFromArchiveRequest, archive io.Reader) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -223,6 +241,7 @@ func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeF return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } diff --git a/lib/volumes/manager_test.go b/lib/volumes/manager_test.go index 3752e230..130fb1c4 100644 --- a/lib/volumes/manager_test.go +++ b/lib/volumes/manager_test.go @@ -23,7 +23,7 @@ func setupTestManager(t *testing.T) (Manager, *paths.Paths, func()) { // Create required directories require.NoError(t, os.MkdirAll(p.VolumesDir(), 0755)) - manager := NewManager(p, 0) // 0 = unlimited storage + manager := NewManager(p, 0, nil) // 0 = unlimited storage cleanup := func() { os.RemoveAll(tmpDir) @@ -389,4 +389,3 @@ func TestMultiAttach_ConcurrentRWConflict(t *testing.T) { assert.Len(t, vol.Attachments, 1, "Should have exactly one attachment") assert.False(t, vol.Attachments[0].Readonly, "Attachment should be read-write") } - diff --git a/lib/volumes/metrics.go b/lib/volumes/metrics.go new file mode 100644 index 00000000..ad8db491 --- /dev/null +++ b/lib/volumes/metrics.go @@ -0,0 +1,105 @@ +package volumes + +import ( + "context" + "os" + "syscall" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for volume operations. +type Metrics struct { + createDuration metric.Float64Histogram +} + +// newVolumeMetrics creates and registers all volume metrics. 
+func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + volumes, err := m.ListVolumes(ctx) + if err != nil { + return nil + } + + o.ObserveInt64(volumesTotal, int64(len(volumes))) + + var totalAllocated, totalUsed int64 + for _, vol := range volumes { + // Allocated = provisioned size (SizeGb converted to bytes) + totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 + + // Used = actual disk blocks consumed (for sparse files) + diskPath := m.paths.VolumeData(vol.Id) + if stat, err := os.Stat(diskPath); err == nil { + // Get actual blocks used via syscall + if sysStat, ok := stat.Sys().(*syscall.Stat_t); ok { + totalUsed += sysStat.Blocks * 512 // Blocks are in 512-byte units + } + } + } + + o.ObserveInt64(allocatedBytes, totalAllocated) + o.ObserveInt64(usedBytes, totalUsed) + return nil + }, + volumesTotal, + allocatedBytes, + usedBytes, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + }, nil +} + +// recordCreateDuration records the volume 
creation duration. +func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.createDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +}