From 8658eca200924f15096536ac54543bf17280009b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:10:26 -0500 Subject: [PATCH 01/14] Add tracing, metrics, configurable logging --- cmd/api/api/api_test.go | 8 +- cmd/api/config/config.go | 34 ++- cmd/api/main.go | 63 ++++- go.mod | 19 +- go.sum | 27 ++- lib/exec/client.go | 16 +- lib/exec/metrics.go | 105 +++++++++ lib/images/manager.go | 121 +++++++++- lib/images/manager_test.go | 30 +-- lib/images/queue.go | 6 + lib/instances/create.go | 26 ++- lib/instances/exec_test.go | 3 +- lib/instances/manager.go | 132 ++++++++++- lib/instances/manager_test.go | 142 ++++++------ lib/instances/resource_limits_test.go | 23 +- lib/instances/restore.go | 25 +- lib/instances/standby.go | 48 ++-- lib/instances/volumes_test.go | 12 +- lib/logger/README.md | 55 +++++ lib/logger/logger.go | 153 ++++++++++++- lib/middleware/otel.go | 99 ++++++++ lib/network/allocate.go | 5 +- lib/network/manager.go | 80 ++++++- lib/otel/README.md | 92 ++++++++ lib/otel/metrics.go | 317 ++++++++++++++++++++++++++ lib/otel/otel.go | 214 +++++++++++++++++ lib/providers/providers.go | 16 +- lib/vmm/metrics.go | 66 ++++++ lib/volumes/manager.go | 123 +++++++++- lib/volumes/manager_test.go | 3 +- 30 files changed, 1881 insertions(+), 182 deletions(-) create mode 100644 lib/exec/metrics.go create mode 100644 lib/logger/README.md create mode 100644 lib/middleware/otel.go create mode 100644 lib/otel/README.md create mode 100644 lib/otel/metrics.go create mode 100644 lib/otel/otel.go create mode 100644 lib/vmm/metrics.go diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index ab536869..9f010002 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -26,18 +26,18 @@ func newTestService(t *testing.T) *ApiService { } p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) if err != nil { t.Fatalf("failed to create image manager: %v", err) } systemMgr := 
system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) // 0 = unlimited storage + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil) // Register cleanup for orphaned Cloud Hypervisor processes t.Cleanup(func() { diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 0b59f6f0..ec810d7e 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -23,13 +23,23 @@ type Config struct { LogRotateInterval string // Resource limits - per instance - MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) - MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) + MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) + MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) // Resource limits - aggregate MaxTotalVcpus int // Aggregate vCPU limit across all instances (0 = unlimited) MaxTotalMemory string // Aggregate memory limit across all instances (0 = unlimited) MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) + + // OpenTelemetry configuration + OtelEnabled bool // Enable OpenTelemetry + OtelEndpoint string // OTLP endpoint (gRPC) + OtelServiceName string // Service name for tracing + OtelInsecure bool // Disable TLS for OTLP + Version string // Application version for telemetry + + // Logging configuration + LogLevel string // Default log level (debug, info, warn, error) } // Load loads configuration from environment variables @@ -43,7 +53,7 @@ func Load() *Config { DataDir: getEnv("DATA_DIR", "/var/lib/hypeman"), BridgeName: getEnv("BRIDGE_NAME", "vmbr0"), 
SubnetCIDR: getEnv("SUBNET_CIDR", "10.100.0.0/16"), - SubnetGateway: getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet + SubnetGateway: getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet UplinkInterface: getEnv("UPLINK_INTERFACE", ""), // empty = auto-detect from default route JwtSecret: getEnv("JWT_SECRET", ""), DNSServer: getEnv("DNS_SERVER", "1.1.1.1"), @@ -61,6 +71,16 @@ func Load() *Config { MaxTotalVcpus: getEnvInt("MAX_TOTAL_VCPUS", 0), MaxTotalMemory: getEnv("MAX_TOTAL_MEMORY", ""), MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), + + // OpenTelemetry configuration + OtelEnabled: getEnvBool("OTEL_ENABLED", false), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), + OtelInsecure: getEnvBool("OTEL_INSECURE", true), + Version: getEnv("VERSION", "dev"), + + // Logging configuration + LogLevel: getEnv("LOG_LEVEL", "info"), } return cfg @@ -82,3 +102,11 @@ func getEnvInt(key string, defaultValue int) int { return defaultValue } +func getEnvBool(key string, defaultValue bool) bool { + if value := os.Getenv(key); value != "" { + if boolVal, err := strconv.ParseBool(value); err == nil { + return boolVal + } + } + return defaultValue +} diff --git a/cmd/api/main.go b/cmd/api/main.go index d2ad80f2..4ff139b1 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -20,9 +20,13 @@ import ( nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware" "github.com/onkernel/hypeman" "github.com/onkernel/hypeman/cmd/api/api" + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/exec" "github.com/onkernel/hypeman/lib/instances" mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" + "github.com/onkernel/hypeman/lib/otel" + "github.com/riandyrn/otelchi" "golang.org/x/sync/errgroup" ) @@ -34,6 +38,41 @@ func main() { } func run() error { + // Load config early for OTel initialization + cfg := 
config.Load() + + // Initialize OpenTelemetry (before wire initialization) + otelCfg := otel.Config{ + Enabled: cfg.OtelEnabled, + Endpoint: cfg.OtelEndpoint, + ServiceName: cfg.OtelServiceName, + Insecure: cfg.OtelInsecure, + Version: cfg.Version, + } + + otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) + if err != nil { + // Log warning but don't fail - graceful degradation + slog.Warn("failed to initialize OpenTelemetry, continuing without telemetry", "error", err) + } + if otelShutdown != nil { + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := otelShutdown(shutdownCtx); err != nil { + slog.Warn("error shutting down OpenTelemetry", "error", err) + } + }() + } + + // Initialize exec metrics if OTel is enabled + if otelProvider != nil && otelProvider.Meter != nil { + execMetrics, err := exec.NewMetrics(otelProvider.Meter) + if err == nil { + exec.SetMetrics(execMetrics) + } + } + // Initialize app with wire app, cleanup, err := initializeApp() if err != nil { @@ -46,6 +85,11 @@ func run() error { logger := app.Logger + // Log OTel status + if cfg.OtelEnabled { + logger.Info("OpenTelemetry enabled", "endpoint", cfg.OtelEndpoint, "service", cfg.OtelServiceName) + } + // Validate JWT secret is configured if app.Config.JwtSecret == "" { logger.Warn("JWT_SECRET not configured - API authentication will fail") @@ -84,6 +128,24 @@ func run() error { // Create router r := chi.NewRouter() + // Add OpenTelemetry tracing middleware if enabled + if cfg.OtelEnabled { + r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) + } + + // Add HTTP metrics middleware if OTel is enabled + var httpMetricsMw func(http.Handler) http.Handler + if otelProvider != nil && otelProvider.Meter != nil { + httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) + if err == nil { + httpMetricsMw = httpMetrics.Middleware + } + } + if httpMetricsMw == nil { + httpMetricsMw = 
mw.NoopHTTPMetrics() + } + r.Use(httpMetricsMw) + // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() if err != nil { @@ -225,4 +287,3 @@ func getRunningInstanceIDs(app *application) []string { } return running } - diff --git a/go.mod b/go.mod index 0f4f6cb0..7c894df1 100644 --- a/go.mod +++ b/go.mod @@ -22,10 +22,19 @@ require ( github.com/opencontainers/image-spec v1.1.1 github.com/opencontainers/runtime-spec v1.2.1 github.com/opencontainers/umoci v0.6.0 + github.com/riandyrn/otelchi v0.12.2 github.com/samber/lo v1.52.0 github.com/stretchr/testify v1.11.1 github.com/u-root/u-root v0.15.0 github.com/vishvananda/netlink v1.3.1 + go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/metric v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 golang.org/x/sync v0.17.0 golang.org/x/sys v0.38.0 golang.org/x/term v0.37.0 @@ -39,17 +48,22 @@ require ( github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/apex/log v1.9.0 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/docker/cli v28.2.2+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect 
github.com/go-test/deep v1.1.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/pgzip v1.2.6 // indirect @@ -66,7 +80,6 @@ require ( github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/sirupsen/logrus v1.9.4-0.20230606125235-dd1b4c2e81af // indirect github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 // indirect @@ -74,9 +87,13 @@ require ( github.com/vbatts/tar-split v0.12.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index ac155882..91d5185a 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 h1:6lhrsTEnloDPXyeZBvSYvQf8u86jbKehZPVDDlkgDl4= github.com/c2h5oh/datasize 
v0.0.0-20231215233829-aa82cc1e6500/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8= github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= @@ -36,6 +38,8 @@ github.com/docker/docker-credential-helpers v0.9.3/go.mod h1:x+4Gbw9aGmChi3qTLZj github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/getkin/kin-openapi v0.133.0 h1:pJdmNohVIJ97r4AUFtEXRXwESr8b0bD721u/Tz6k8PQ= github.com/getkin/kin-openapi v0.133.0/go.mod h1:boAciF6cXk5FhPqe/NQeBTeenbjqU4LhWBf09ILVvWE= @@ -44,6 +48,7 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME github.com/go-chi/chi/v5 v5.2.3 h1:WQIt9uxdsAbgIYgid+BpYc+liqQZGMHRaUwp0JUcvdE= github.com/go-chi/chi/v5 v5.2.3/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -75,6 +80,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= @@ -144,9 +151,11 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/riandyrn/otelchi v0.12.2 h1:6QhGv0LVw/dwjtPd12mnNrl0oEQF4ZAlmHcnlTYbeAg= +github.com/riandyrn/otelchi v0.12.2/go.mod h1:weZZeUJURvtCcbWsdb7Y6F8KFZGedJlSrgUjq9VirV8= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= 
github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 h1:58jvc5cZ+hGKidQ4Z37/+rj9eQxRRjOOsqNEwPSZXR4= github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67/go.mod h1:LLjEAc6zmycfeN7/1fxIphWQPjHpTt7ElqT7eVf8e4A= github.com/samber/lo v1.52.0 h1:Rvi+3BFHES3A8meP33VPAxiBZX/Aws5RxrschYGjomw= @@ -189,8 +198,16 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod 
h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= @@ -199,6 +216,10 @@ go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6 go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= @@ -230,6 +251,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 h1:M1rk8KBnUsBDg1oPGHNCxG4vc1f49epmTO7xscSajMk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8/go.mod 
h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= diff --git a/lib/exec/client.go b/lib/exec/client.go index 0484bc96..4ca9fd40 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -9,6 +9,7 @@ import ( "net" "strings" "sync" + "sync/atomic" "time" "google.golang.org/grpc" @@ -111,6 +112,9 @@ func (c *bufferedConn) Read(p []byte) (int, error) { // ExecIntoInstance executes command in instance via vsock using gRPC // vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { + start := time.Now() + var bytesSent int64 + // Get or create a reusable gRPC connection for this vsock socket // Connection pooling avoids issues with rapid connect/disconnect cycles grpcConn, err := getOrCreateConn(ctx, vsockSocketPath) @@ -146,13 +150,14 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti // Handle stdin in background if opts.Stdin != nil { go func() { - buf := make([]byte, 32 * 1024) + buf := make([]byte, 32*1024) for { n, err := opts.Stdin.Read(buf) if n > 0 { stream.Send(&ExecRequest{ Request: &ExecRequest_Stdin{Stdin: buf[:n]}, }) + atomic.AddInt64(&bytesSent, int64(n)) } if err != nil { stream.CloseSend() @@ -185,7 +190,13 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti opts.Stderr.Write(r.Stderr) } case *ExecResponse_ExitCode: - return &ExitStatus{Code: int(r.ExitCode)}, nil + exitCode := int(r.ExitCode) + // Record metrics + if ExecMetrics != nil { + bytesReceived := int64(totalStdout + totalStderr) + ExecMetrics.RecordSession(ctx, start, exitCode, atomic.LoadInt64(&bytesSent), bytesReceived) + } + return &ExitStatus{Code: exitCode}, nil } } } @@ -250,4 +261,3 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { // This ensures any 
bytes buffered during handshake are not lost return &bufferedConn{Conn: conn, reader: reader}, nil } - diff --git a/lib/exec/metrics.go b/lib/exec/metrics.go new file mode 100644 index 00000000..ef1c83c7 --- /dev/null +++ b/lib/exec/metrics.go @@ -0,0 +1,105 @@ +package exec + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for exec operations. +type Metrics struct { + sessionsTotal metric.Int64Counter + duration metric.Float64Histogram + bytesSentTotal metric.Int64Counter + bytesReceivedTotal metric.Int64Counter +} + +// ExecMetrics is the global metrics instance for the exec package. +// Set this via SetMetrics() during application initialization. +var ExecMetrics *Metrics + +// SetMetrics sets the global metrics instance. +func SetMetrics(m *Metrics) { + ExecMetrics = m +} + +// NewMetrics creates exec metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + sessionsTotal, err := meter.Int64Counter( + "hypeman_exec_sessions_total", + metric.WithDescription("Total number of exec sessions"), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "hypeman_exec_duration_seconds", + metric.WithDescription("Exec command duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + bytesSentTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_sent_total", + metric.WithDescription("Total bytes sent to guest (stdin)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + bytesReceivedTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_received_total", + metric.WithDescription("Total bytes received from guest (stdout+stderr)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + sessionsTotal: sessionsTotal, + duration: 
duration, + bytesSentTotal: bytesSentTotal, + bytesReceivedTotal: bytesReceivedTotal, + }, nil +} + +// RecordSession records metrics for a completed exec session. +func (m *Metrics) RecordSession(ctx context.Context, start time.Time, exitCode int, bytesSent, bytesReceived int64) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if exitCode != 0 { + status = "error" + } + + m.sessionsTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("status", status), + attribute.Int("exit_code", exitCode), + )) + + m.duration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) + + if bytesSent > 0 { + m.bytesSentTotal.Add(ctx, bytesSent) + } + if bytesReceived > 0 { + m.bytesReceivedTotal.Add(ctx, bytesReceived) + } +} diff --git a/lib/images/manager.go b/lib/images/manager.go index 8d6a403b..d167c5bd 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -10,6 +10,8 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) const ( @@ -28,15 +30,23 @@ type Manager interface { RecoverInterruptedBuilds() } +// Metrics holds the metrics instruments for image operations. +type Metrics struct { + buildDuration metric.Float64Histogram + pullsTotal metric.Int64Counter +} + type manager struct { paths *paths.Paths ociClient *ociClient queue *BuildQueue createMu sync.Mutex + metrics *Metrics } -// NewManager creates a new image manager -func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { +// NewManager creates a new image manager. +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Manager, error) { // Create cache directory under dataDir for OCI layouts cacheDir := p.SystemOCICache() ociClient, err := newOCIClient(cacheDir) @@ -49,10 +59,89 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { ociClient: ociClient, queue: NewBuildQueue(maxConcurrentBuilds), } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newMetrics(meter, m) + if err != nil { + return nil, fmt.Errorf("create metrics: %w", err) + } + m.metrics = metrics + } + m.RecoverInterruptedBuilds() return m, nil } +// newMetrics creates and registers all image metrics. +func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges for queue length and total images + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + // Report queue length + o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) + + // Count images by status + metas, err := listAllTags(m.paths) + if err != nil { + return nil + } + statusCounts := make(map[string]int64) + for _, meta := range metas { + 
statusCounts[meta.Status]++ + } + for status, count := range statusCounts { + o.ObserveInt64(imagesTotal, count, + metric.WithAttributes(attribute.String("status", status))) + } + return nil + }, + buildQueueLength, + imagesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + buildDuration: buildDuration, + pullsTotal: pullsTotal, + }, nil +} + func (m *manager) ListImages(ctx context.Context) ([]Image, error) { metas, err := listAllTags(m.paths) if err != nil { @@ -78,7 +167,7 @@ func (m *manager) CreateImage(ctx context.Context, req CreateImageRequest) (*Ima // Add a 2-second timeout to ensure fast failure on rate limits or errors resolveCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() - + ref, err := normalized.Resolve(resolveCtx, m.ociClient) if err != nil { return nil, fmt.Errorf("resolve manifest: %w", err) @@ -134,11 +223,13 @@ func (m *manager) createAndQueueImage(ref *ResolvedRef) (*Image, error) { } func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { + buildStart := time.Now() buildDir := m.paths.SystemBuild(ref.String()) tempDir := filepath.Join(buildDir, "rootfs") if err := os.MkdirAll(buildDir, 0755); err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("create build dir: %w", err)) + m.recordBuildMetrics(ctx, buildStart, "failed") return } @@ -153,8 +244,11 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { result, err := m.ociClient.pullAndExport(ctx, ref.String(), ref.Digest(), tempDir) if err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("pull and export: %w", err)) + m.recordPullMetrics(ctx, "failed") + m.recordBuildMetrics(ctx, buildStart, "failed") return } + m.recordPullMetrics(ctx, "success") // Check if this digest already exists and is ready (deduplication) if meta, err := readMetadata(m.paths, ref.Repository(), ref.DigestHex()); err == nil { @@ -209,6 +303,27 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { 
fmt.Fprintf(os.Stderr, "Warning: failed to create tag symlink: %v\n", err) } } + + m.recordBuildMetrics(ctx, buildStart, "success") +} + +// recordBuildMetrics records the build duration metric. +func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.buildDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordPullMetrics records the pull counter metric. +func (m *manager) recordPullMetrics(ctx context.Context, status string) { + if m.metrics == nil { + return + } + m.metrics.pullsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("status", status))) } func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err error) { diff --git a/lib/images/manager_test.go b/lib/images/manager_test.go index 984a7ba3..406566ef 100644 --- a/lib/images/manager_test.go +++ b/lib/images/manager_test.go @@ -14,7 +14,7 @@ import ( func TestCreateImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -41,7 +41,7 @@ func TestCreateImage(t *testing.T) { ref, err := ParseNormalizedRef(img.Name) require.NoError(t, err) digestHex := strings.SplitN(img.Digest, ":", 2)[1] - + // Check erofs disk file diskPath := digestPath(paths.New(dataDir), ref.Repository(), digestHex) diskStat, err := os.Stat(diskPath) @@ -56,7 +56,7 @@ func TestCreateImage(t *testing.T) { metaStat, err := os.Stat(metadataPath) require.NoError(t, err) require.False(t, metaStat.IsDir(), "metadata should be a file") - + // Read and verify metadata content meta, err := readMetadata(paths.New(dataDir), ref.Repository(), digestHex) require.NoError(t, err) @@ -66,7 +66,7 @@ func TestCreateImage(t *testing.T) { require.Nil(t, meta.Error) require.Equal(t, diskStat.Size(), meta.SizeBytes) 
require.NotEmpty(t, meta.Env, "should have environment variables") - t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", + t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", meta.Name, meta.Digest, meta.Status, len(meta.Env)) // Check that tag symlink exists and points to correct digest @@ -74,7 +74,7 @@ func TestCreateImage(t *testing.T) { linkStat, err := os.Lstat(linkPath) require.NoError(t, err) require.NotEqual(t, 0, linkStat.Mode()&os.ModeSymlink, "should be a symlink") - + // Verify symlink points to digest directory linkTarget, err := os.Readlink(linkPath) require.NoError(t, err) @@ -84,7 +84,7 @@ func TestCreateImage(t *testing.T) { func TestCreateImageDifferentTag(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -98,7 +98,7 @@ func TestCreateImageDifferentTag(t *testing.T) { require.Equal(t, "docker.io/library/alpine:3.18", img.Name) waitForReady(t, mgr, ctx, img.Name) - + img, err = mgr.GetImage(ctx, img.Name) require.NoError(t, err) require.NotEmpty(t, img.Digest) @@ -106,7 +106,7 @@ func TestCreateImageDifferentTag(t *testing.T) { func TestCreateImageDuplicate(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -135,7 +135,7 @@ func TestCreateImageDuplicate(t *testing.T) { func TestListImages(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -164,7 +164,7 @@ func TestListImages(t *testing.T) { func TestGetImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ 
-188,7 +188,7 @@ func TestGetImage(t *testing.T) { func TestGetImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -199,7 +199,7 @@ func TestGetImageNotFound(t *testing.T) { func TestDeleteImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -234,7 +234,7 @@ func TestDeleteImage(t *testing.T) { func TestDeleteImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -271,7 +271,7 @@ func TestNormalizedRefParsing(t *testing.T) { func TestLayerCaching(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -326,7 +326,7 @@ func TestLayerCaching(t *testing.T) { digestHex := strings.TrimPrefix(alpine1.Digest, "sha256:") disk1 := digestPath(paths.New(dataDir), alpine1Parsed.Repository(), digestHex) disk2 := digestPath(paths.New(dataDir), alpine2Parsed.Repository(), digestHex) - + require.Equal(t, disk1, disk2, "both references should point to same disk") stat, err := os.Stat(disk1) diff --git a/lib/images/queue.go b/lib/images/queue.go index 115d267c..5d08100a 100644 --- a/lib/images/queue.go +++ b/lib/images/queue.go @@ -113,3 +113,9 @@ func (q *BuildQueue) PendingCount() int { return len(q.pending) } +// QueueLength returns the total number of builds (active + pending) +func (q *BuildQueue) QueueLength() int { + q.mu.Lock() + defer q.mu.Unlock() + return len(q.active) + len(q.pending) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index faa32279..b82e421d 100644 --- 
a/lib/instances/create.go +++ b/lib/instances/create.go @@ -15,6 +15,7 @@ import ( "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/vmm" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" ) @@ -93,9 +94,17 @@ func (m *manager) createInstance( ctx context.Context, req CreateInstanceRequest, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "creating instance", "name", req.Name, "image", req.Image, "vcpus", req.Vcpus) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "CreateInstance") + defer span.End() + } + // 1. Validate request if err := validateCreateRequest(req); err != nil { log.ErrorContext(ctx, "invalid create request", "error", err) @@ -336,6 +345,12 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.createDuration, start, "success") + m.recordStateTransition(ctx, "stopped", string(StateRunning)) + } + // Return instance with derived state finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance created successfully", "id", id, "name", req.Name, "state", finalInst.State) @@ -452,14 +467,14 @@ func (m *manager) startAndBootVM( netConfig *network.NetworkConfig, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -539,7 +554,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo 
*images.Image, netConf BootVcpus: inst.Vcpus, MaxVcpus: inst.Vcpus, } - + // Calculate and set guest topology based on host topology if topology := calculateGuestTopology(inst.Vcpus, m.hostTopology); topology != nil { cpus.Topology = topology @@ -551,7 +566,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf } if inst.HotplugSize > 0 { memory.HotplugSize = &inst.HotplugSize - memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case + memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case } // Disk configuration @@ -644,4 +659,3 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf func ptr[T any](v T) *T { return &v } - diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index ef0ba8f0..cee4e71b 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -46,7 +46,7 @@ func TestExecConcurrent(t *testing.T) { p := paths.New(tmpDir) // Setup image - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling nginx:alpine image...") @@ -215,4 +215,3 @@ func TestExecConcurrent(t *testing.T) { t.Logf("Long-running streams completed in %v (concurrent OK)", streamElapsed) } - diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 0c939a5c..68e9b36c 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,12 +4,16 @@ import ( "context" "fmt" "sync" + "time" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" ) type Manager interface { @@ -35,6 +39,15 @@ type ResourceLimits struct { MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) } +// 
Metrics holds the metrics instruments for instance operations. +type Metrics struct { + createDuration metric.Float64Histogram + restoreDuration metric.Float64Histogram + standbyDuration metric.Float64Histogram + stateTransitions metric.Int64Counter + tracer trace.Tracer +} + type manager struct { paths *paths.Paths imageManager images.Manager @@ -44,11 +57,13 @@ type manager struct { limits ResourceLimits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks hostTopology *HostTopology // Cached host CPU topology + metrics *Metrics } -// NewManager creates a new instances manager -func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits) Manager { - return &manager{ +// NewManager creates a new instances manager. +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits, meter metric.Meter, tracer trace.Tracer) Manager { + m := &manager{ paths: p, imageManager: imageManager, systemManager: systemManager, @@ -58,6 +73,115 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste instanceLocks: sync.Map{}, hostTopology: detectHostTopology(), // Detect and cache host topology } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newInstanceMetrics(meter, tracer, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newInstanceMetrics creates and registers all instance metrics. 
+func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for instance counts by state + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + instances, err := m.listInstances(ctx) + if err != nil { + return nil + } + stateCounts := make(map[string]int64) + for _, inst := range instances { + stateCounts[string(inst.State)]++ + } + for state, count := range stateCounts { + o.ObserveInt64(instancesTotal, count, + metric.WithAttributes(attribute.String("state", state))) + } + return nil + }, + instancesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + restoreDuration: restoreDuration, + standbyDuration: standbyDuration, + stateTransitions: stateTransitions, + tracer: tracer, + }, nil +} + +// recordDuration records operation 
duration. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + histogram.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordStateTransition records a state transition. +func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { + if m.metrics == nil { + return + } + m.metrics.stateTransitions.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", fromState), + attribute.String("to", toState), + )) } // getInstanceLock returns or creates a lock for a specific instance @@ -81,7 +205,7 @@ func (m *manager) DeleteInstance(ctx context.Context, id string) error { lock := m.getInstanceLock(id) lock.Lock() defer lock.Unlock() - + err := m.deleteInstance(ctx, id) if err == nil { // Clean up the lock after successful deletion diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 4b533e0a..8534cdf4 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -26,21 +26,21 @@ import ( // setupTestManager creates a manager and registers cleanup for any orphaned processes func setupTestManager(t *testing.T) (*manager, string) { tmpDir := t.TempDir() - + cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } - + p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) - + systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := ResourceLimits{ 
MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -48,20 +48,20 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) - + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) + // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { cleanupOrphanedProcesses(t, mgr) }) - + return mgr, tmpDir } // waitForVMReady polls VM state via VMM API until it's running or times out func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { // Try to connect to VMM client, err := vmm.NewVMM(socketPath) @@ -70,48 +70,48 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio time.Sleep(100 * time.Millisecond) continue } - + // Get VM info infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(100 * time.Millisecond) continue } - + // Check if VM is running if infoResp.JSON200.State == vmm.Running { return nil } - + time.Sleep(100 * time.Millisecond) } - + return fmt.Errorf("VM did not reach running state within %v", timeout) } // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { logs, err := collectLogs(ctx, mgr, instanceID, 200) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if strings.Contains(logs, message) { return nil } - + time.Sleep(100 * time.Millisecond) } - + return 
fmt.Errorf("message %q not found in logs within %v", message, timeout) } @@ -121,12 +121,12 @@ func collectLogs(ctx context.Context, mgr *manager, instanceID string, n int) (s if err != nil { return "", err } - + var lines []string for line := range logChan { lines = append(lines, line) } - + return strings.Join(lines, "\n"), nil } @@ -137,26 +137,26 @@ func cleanupOrphanedProcesses(t *testing.T, mgr *manager) { if err != nil { return // No metadata files, nothing to clean } - + for _, metaFile := range metaFiles { // Extract instance ID from path id := filepath.Base(filepath.Dir(metaFile)) - + // Load metadata meta, err := mgr.loadMetadata(id) if err != nil { continue } - + // If metadata has a PID, try to kill it if meta.CHPID != nil { pid := *meta.CHPID - + // Check if process exists if err := syscall.Kill(pid, 0); err == nil { t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d (instance %s)", pid, id) syscall.Kill(pid, syscall.SIGKILL) - + // Wait for process to exit WaitForProcessExit(pid, 1*time.Second) } @@ -174,7 +174,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { ctx := context.Background() // Get the image manager from the manager (we need it for image operations) - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (runs a daemon, won't exit) @@ -210,7 +210,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create a volume to attach p := paths.New(tmpDir) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "test-data", @@ -226,12 +226,12 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create instance with real nginx image and attached volume req := CreateInstanceRequest{ - Name: "test-nginx", - Image: 
"docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, // 512MB - HotplugSize: 512 * 1024 * 1024, // 512MB - OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB - Vcpus: 1, + Name: "test-nginx", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, // 512MB + HotplugSize: 512 * 1024 * 1024, // 512MB + OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + Vcpus: 1, NetworkEnabled: false, // No network for tests Env: map[string]string{ "TEST_VAR": "test_value", @@ -299,33 +299,33 @@ func TestCreateAndDeleteInstance(t *testing.T) { for i := 0; i < 50; i++ { // Poll for up to 5 seconds (50 * 100ms) logs, err = collectLogs(ctx, manager, inst.Id, 100) require.NoError(t, err) - + if strings.Contains(logs, "start worker processes") { foundNginxStartup = true break } time.Sleep(100 * time.Millisecond) } - + t.Logf("Instance logs (last 100 lines):\n%s", logs) - + // Verify nginx started successfully assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 5 seconds") // Test volume is accessible from inside the guest via exec t.Log("Testing volume from inside guest via exec...") - + // Helper to run command in guest with retry (exec agent may need time between connections) runCmd := func(command ...string) (string, int, error) { var lastOutput string var lastExitCode int var lastErr error - + for attempt := 0; attempt < 5; attempt++ { if attempt > 0 { time.Sleep(200 * time.Millisecond) } - + var stdout, stderr bytes.Buffer exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ Command: command, @@ -333,31 +333,31 @@ func TestCreateAndDeleteInstance(t *testing.T) { Stderr: &stderr, TTY: false, }) - + // Combine stdout and stderr output := stdout.String() if stderr.Len() > 0 { output += stderr.String() } output = strings.TrimSpace(output) - + if err != nil { lastErr = err lastOutput = output lastExitCode = -1 continue } - + lastOutput = output lastExitCode = exit.Code lastErr = nil - + // Success if we got output or 
it's a command expected to have no output if output != "" || exit.Code == 0 { return output, exit.Code, nil } } - + return lastOutput, lastExitCode, lastErr } @@ -375,11 +375,11 @@ func TestCreateAndDeleteInstance(t *testing.T) { echo "=== Volume mount info ===" df -h /mnt/data `, testContent) - + output, exitCode, err := runCmd("sh", "-c", script) require.NoError(t, err, "Volume test script should execute") require.Equal(t, 0, exitCode, "Volume test script should succeed") - + // Verify all expected output is present require.Contains(t, output, "lost+found", "Volume should be ext4-formatted") require.Contains(t, output, testContent, "Should be able to read written content") @@ -455,7 +455,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Verify volume is gone _, err = volumeManager.GetVolume(ctx, vol.Id) assert.ErrorIs(t, err, volumes.ErrNotFound) - + t.Log("Instance and volume lifecycle test complete!") } @@ -464,17 +464,17 @@ func TestStorageOperations(t *testing.T) { tmpDir := t.TempDir() cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } p := paths.New(tmpDir) - imageManager, _ := images.NewManager(p, 1) + imageManager, _ := images.NewManager(p, 1, nil) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -482,7 +482,7 @@ func TestStorageOperations(t *testing.T) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - manager := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + manager := NewManager(p, imageManager, 
systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Test metadata doesn't exist initially _, err := manager.loadMetadata("nonexistent") @@ -544,7 +544,7 @@ func TestStandbyAndRestore(t *testing.T) { ctx := context.Background() // Create image manager for pulling nginx - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (reuse if already pulled in previous test) @@ -577,14 +577,14 @@ func TestStandbyAndRestore(t *testing.T) { // Create instance t.Log("Creating instance...") req := CreateInstanceRequest{ - Name: "test-standby", - Image: "docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, - HotplugSize: 512 * 1024 * 1024, - OverlaySize: 10 * 1024 * 1024 * 1024, - Vcpus: 1, + Name: "test-standby", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, + HotplugSize: 512 * 1024 * 1024, + OverlaySize: 10 * 1024 * 1024 * 1024, + Vcpus: 1, NetworkEnabled: false, // No network for tests - Env: map[string]string{}, + Env: map[string]string{}, } inst, err := manager.CreateInstance(ctx, req) @@ -610,7 +610,7 @@ func TestStandbyAndRestore(t *testing.T) { assert.DirExists(t, snapshotDir) assert.FileExists(t, filepath.Join(snapshotDir, "memory-ranges")) // Cloud Hypervisor creates various snapshot files, just verify directory exists - + // DEBUG: Check snapshot files (for comparison with networking test) t.Log("DEBUG: Snapshot files for non-network instance:") entries, _ := os.ReadDir(snapshotDir) @@ -618,7 +618,7 @@ func TestStandbyAndRestore(t *testing.T) { info, _ := entry.Info() t.Logf(" - %s (size: %d bytes)", entry.Name(), info.Size()) } - + // DEBUG: Check console.log file size before restore consoleLogPath := filepath.Join(tmpDir, "guests", inst.Id, "logs", "console.log") var consoleLogSizeBefore int64 @@ -633,7 +633,7 @@ func TestStandbyAndRestore(t *testing.T) { require.NoError(t, err) 
assert.Equal(t, StateRunning, inst.State) t.Log("Instance restored and running") - + // DEBUG: Check console.log file size after restore if info, err := os.Stat(consoleLogPath); err == nil { consoleLogSizeAfter := info.Size() @@ -648,7 +648,7 @@ func TestStandbyAndRestore(t *testing.T) { t.Log("Cleaning up...") err = manager.DeleteInstance(ctx, inst.Id) require.NoError(t, err) - + t.Log("Standby/restore test complete!") } @@ -685,6 +685,4 @@ func TestStateTransitions(t *testing.T) { } } - // No mock image manager needed - tests use real images! - diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index ac27b561..83930421 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -154,23 +154,23 @@ func createTestManager(t *testing.T, limits ResourceLimits) *manager { cfg := &config.Config{DataDir: tmpDir} p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemMgr := system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) - return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits).(*manager) + return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil).(*manager) } func TestResourceLimits_StructValues(t *testing.T) { limits := ResourceLimits{ - MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB MaxVcpusPerInstance: 4, - MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB + MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB MaxTotalVcpus: 16, - MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB + MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB } assert.Equal(t, int64(10*1024*1024*1024), limits.MaxOverlaySize) @@ -246,12 +246,12 @@ func 
TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { } p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // Set small aggregate limits: // - MaxTotalVcpus: 2 (first VM gets 1, second wants 2 -> denied) @@ -264,7 +264,7 @@ func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { MaxTotalMemory: 2 * 1024 * 1024 * 1024, // aggregate: only 2GB total } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Cleanup any orphaned processes on test end t.Cleanup(func() { @@ -375,4 +375,3 @@ func cleanupTestProcesses(t *testing.T, mgr *manager) { } } } - diff --git a/lib/instances/restore.go b/lib/instances/restore.go index ffce3d36..53b7b3e9 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -8,18 +8,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // RestoreInstance restores an instance from standby // Multi-hop orchestration: Standby → Paused → Running func (m *manager) restoreInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "restoring instance from standby", "id", id) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "RestoreInstance") + defer span.End() + } + // 1. 
Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -108,6 +117,12 @@ func (m *manager) restoreInstance( log.WarnContext(ctx, "failed to update metadata after restore", "id", id, "error", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.restoreDuration, start, "success") + m.recordStateTransition(ctx, string(StateStandby), string(StateRunning)) + } + // Return instance with derived state (should be Running now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance restored successfully", "id", id, "state", finalInst.State) @@ -121,14 +136,14 @@ func (m *manager) restoreFromSnapshot( snapshotDir string, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process for restore", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -162,5 +177,3 @@ func (m *manager) restoreFromSnapshot( log.DebugContext(ctx, "VM restored from snapshot successfully", "id", stored.Id) return nil } - - diff --git a/lib/instances/standby.go b/lib/instances/standby.go index ad745164..68e67661 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -9,18 +9,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // StandbyInstance puts an instance in standby state // Multi-hop orchestration: Running → Paused → Standby func (m *manager) standbyInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "putting instance in standby", "id", id) - + + // Start tracing span if tracer is available 
+ if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "StandbyInstance") + defer span.End() + } + // 1. Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -110,6 +119,12 @@ func (m *manager) standbyInstance( return nil, fmt.Errorf("save metadata: %w", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.standbyDuration, start, "success") + m.recordStateTransition(ctx, string(StateRunning), string(StateStandby)) + } + // Return instance with derived state (should be Standby now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance put in standby successfully", "id", id, "state", finalInst.State) @@ -131,34 +146,34 @@ func reduceMemory(ctx context.Context, client *vmm.VMM, targetBytes int64) error func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeout time.Duration) error { log := logger.FromContext(ctx) deadline := time.Now().Add(timeout) - + // Use 20ms for fast response with minimal overhead const pollInterval = 20 * time.Millisecond const stabilityThreshold = 3 // Memory unchanged for 3 checks = stable - + var previousSize *int64 unchangedCount := 0 - + for time.Now().Before(deadline) { infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(pollInterval) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(pollInterval) continue } - + actualSize := infoResp.JSON200.MemoryActualSize if actualSize == nil { time.Sleep(pollInterval) continue } - + currentSize := *actualSize - + // Best case: reached target or below if currentSize <= targetBytes { log.DebugContext(ctx, "memory reduced to target", @@ -166,7 +181,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo "actual_mb", currentSize/(1024*1024)) return nil } - + // Check if memory has stopped shrinking (stabilized above target) if previousSize != nil { if currentSize == 
*previousSize { @@ -185,11 +200,11 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo unchangedCount = 0 } } - + previousSize = &currentSize time.Sleep(pollInterval) } - + // Timeout - memory never stabilized return fmt.Errorf("memory reduction did not complete within %v", timeout) } @@ -197,7 +212,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo // createSnapshot creates a Cloud Hypervisor snapshot func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) error { log := logger.FromContext(ctx) - + // Remove old snapshot os.RemoveAll(snapshotDir) @@ -227,7 +242,7 @@ func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) er // shutdownVMM gracefully shuts down the VMM process via API func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log := logger.FromContext(ctx) - + // Try to connect to VMM client, err := vmm.NewVMM(inst.SocketPath) if err != nil { @@ -239,7 +254,7 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { // Try graceful shutdown log.DebugContext(ctx, "sending shutdown command to VMM", "id", inst.Id) client.ShutdownVMMWithResponse(ctx) - + // Wait for process to exit if inst.CHPID != nil { if !WaitForProcessExit(*inst.CHPID, 2*time.Second) { @@ -248,7 +263,6 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log.DebugContext(ctx, "VMM shutdown gracefully", "id", inst.Id, "pid", *inst.CHPID) } } - + return nil } - diff --git a/lib/instances/volumes_test.go b/lib/instances/volumes_test.go index c8a36816..5e46d007 100644 --- a/lib/instances/volumes_test.go +++ b/lib/instances/volumes_test.go @@ -67,7 +67,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...")
@@ -92,7 +92,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { t.Log("System files ready") // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "shared-data", @@ -250,7 +250,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -272,7 +272,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { require.NoError(t, err) // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "cleanup-test-vol", SizeGb: 1, @@ -359,7 +359,7 @@ func TestVolumeFromArchive(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -393,7 +393,7 @@ func TestVolumeFromArchive(t *testing.T) { archive := createTestTarGz(t, testFiles) // Create volume from archive - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume from archive...") vol, err := volumeManager.CreateVolumeFromArchive(ctx, volumes.CreateVolumeFromArchiveRequest{ Name: "archive-data", diff --git a/lib/logger/README.md b/lib/logger/README.md new file mode 100644 index 00000000..4996bfeb --- /dev/null +++ b/lib/logger/README.md @@ -0,0 +1,55 @@ +# Logger + +Structured logging with per-subsystem log levels and OpenTelemetry trace context integration. 
+ +## Features + +- Per-subsystem configurable log levels +- Automatic trace_id/span_id injection when OTel is active +- Context-based logger propagation +- JSON output format + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `LOG_LEVEL` | Default log level | `info` | +| `LOG_LEVEL_<SUBSYSTEM>` | Per-subsystem level | inherits default | + +Subsystems: `API`, `IMAGES`, `INSTANCES`, `NETWORK`, `VOLUMES`, `VMM`, `SYSTEM`, `EXEC` + +Example: +```bash +LOG_LEVEL=info LOG_LEVEL_NETWORK=debug ./hypeman +``` + +## Usage + +```go +// Create subsystem-specific logger +cfg := logger.NewConfig() +log := logger.NewSubsystemLogger(logger.SubsystemInstances, cfg) + +// Add logger to context +ctx = logger.AddToContext(ctx, log) + +// Retrieve from context +log = logger.FromContext(ctx) +log.InfoContext(ctx, "instance created", "id", instanceID) +``` + +## Output + +When OTel tracing is active, logs include trace context: + +```json +{ + "level": "INFO", + "msg": "instance created", + "subsystem": "INSTANCES", + "trace_id": "abc123...", + "span_id": "def456...", + "id": "instance-123" +} +``` + diff --git a/lib/logger/logger.go b/lib/logger/logger.go index b4ac1528..23a69ef3 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -1,20 +1,165 @@ +// Package logger provides structured logging with subsystem-specific levels +// and OpenTelemetry trace context integration. package logger import ( "context" "log/slog" + "os" + "strings" + + "go.opentelemetry.io/otel/trace" ) type contextKey string const loggerKey contextKey = "logger" -// AddToContext adds a logger to the context +// Subsystem names for per-subsystem logging configuration. +const ( + SubsystemAPI = "API" + SubsystemImages = "IMAGES" + SubsystemInstances = "INSTANCES" + SubsystemNetwork = "NETWORK" + SubsystemVolumes = "VOLUMES" + SubsystemVMM = "VMM" + SubsystemSystem = "SYSTEM" + SubsystemExec = "EXEC" +) + +// Config holds logging configuration. 
+type Config struct { + // DefaultLevel is the default log level for all subsystems. + DefaultLevel slog.Level + // SubsystemLevels maps subsystem names to their specific log levels. + // If a subsystem is not in this map, DefaultLevel is used. + SubsystemLevels map[string]slog.Level + // AddSource adds source file information to log entries. + AddSource bool +} + +// NewConfig creates a Config from environment variables. +// Reads LOG_LEVEL for default level and LOG_LEVEL_<SUBSYSTEM> for per-subsystem levels. +func NewConfig() Config { + cfg := Config{ + DefaultLevel: slog.LevelInfo, + SubsystemLevels: make(map[string]slog.Level), + AddSource: false, + } + + // Parse default level + if levelStr := os.Getenv("LOG_LEVEL"); levelStr != "" { + cfg.DefaultLevel = parseLevel(levelStr) + } + + // Parse subsystem-specific levels + subsystems := []string{ + SubsystemAPI, SubsystemImages, SubsystemInstances, + SubsystemNetwork, SubsystemVolumes, SubsystemVMM, + SubsystemSystem, SubsystemExec, + } + for _, subsystem := range subsystems { + envKey := "LOG_LEVEL_" + subsystem + if levelStr := os.Getenv(envKey); levelStr != "" { + cfg.SubsystemLevels[subsystem] = parseLevel(levelStr) + } + } + + return cfg +} + +// parseLevel parses a log level string. +func parseLevel(s string) slog.Level { + switch strings.ToLower(s) { + case "debug": + return slog.LevelDebug + case "info": + return slog.LevelInfo + case "warn", "warning": + return slog.LevelWarn + case "error": + return slog.LevelError + default: + return slog.LevelInfo + } +} + +// LevelFor returns the log level for the given subsystem. +func (c Config) LevelFor(subsystem string) slog.Level { + if level, ok := c.SubsystemLevels[subsystem]; ok { + return level + } + return c.DefaultLevel +} + +// NewLogger creates a new slog.Logger with JSON output. 
+func NewLogger(cfg Config) *slog.Logger { + return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: cfg.DefaultLevel, + AddSource: cfg.AddSource, + })) +} + +// NewSubsystemLogger creates a logger for a specific subsystem with its configured level. +func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { + level := cfg.LevelFor(subsystem) + handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: level, + AddSource: cfg.AddSource, + }) + // Wrap with trace context handler if we want trace IDs in logs + wrappedHandler := &traceContextHandler{ + Handler: handler, + subsystem: subsystem, + } + return slog.New(wrappedHandler) +} + +// traceContextHandler wraps a slog.Handler to add trace context and subsystem. +type traceContextHandler struct { + slog.Handler + subsystem string +} + +// Handle adds trace_id and span_id from the context if available. +func (h *traceContextHandler) Handle(ctx context.Context, r slog.Record) error { + // Add subsystem attribute + r.AddAttrs(slog.String("subsystem", h.subsystem)) + + // Add trace context if available + spanCtx := trace.SpanContextFromContext(ctx) + if spanCtx.IsValid() { + r.AddAttrs( + slog.String("trace_id", spanCtx.TraceID().String()), + slog.String("span_id", spanCtx.SpanID().String()), + ) + } + + return h.Handler.Handle(ctx, r) +} + +// WithAttrs returns a new handler with the given attributes. +func (h *traceContextHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithAttrs(attrs), + subsystem: h.subsystem, + } +} + +// WithGroup returns a new handler with the given group name. +func (h *traceContextHandler) WithGroup(name string) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithGroup(name), + subsystem: h.subsystem, + } +} + +// AddToContext adds a logger to the context. 
func AddToContext(ctx context.Context, logger *slog.Logger) context.Context { return context.WithValue(ctx, loggerKey, logger) } -// FromContext retrieves the logger from context, or returns default +// FromContext retrieves the logger from context, or returns default. func FromContext(ctx context.Context) *slog.Logger { if logger, ok := ctx.Value(loggerKey).(*slog.Logger); ok { return logger @@ -22,3 +167,7 @@ func FromContext(ctx context.Context) *slog.Logger { return slog.Default() } +// With returns a logger with additional attributes. +func With(logger *slog.Logger, args ...any) *slog.Logger { + return logger.With(args...) +} diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go new file mode 100644 index 00000000..823aef08 --- /dev/null +++ b/lib/middleware/otel.go @@ -0,0 +1,99 @@ +package middleware + +import ( + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// HTTPMetrics holds the OTel metrics for HTTP requests. +type HTTPMetrics struct { + requestsTotal metric.Int64Counter + requestDuration metric.Float64Histogram +} + +// NewHTTPMetrics creates new HTTP metrics instruments. +func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { + requestsTotal, err := meter.Int64Counter( + "hypeman_http_requests_total", + metric.WithDescription("Total number of HTTP requests"), + ) + if err != nil { + return nil, err + } + + requestDuration, err := meter.Float64Histogram( + "hypeman_http_request_duration_seconds", + metric.WithDescription("HTTP request duration in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &HTTPMetrics{ + requestsTotal: requestsTotal, + requestDuration: requestDuration, + }, nil +} + +// Middleware returns an HTTP middleware that records metrics. 
+func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code + wrapped := &responseWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } + + // Process request + next.ServeHTTP(wrapped, r) + + // Calculate duration + duration := time.Since(start).Seconds() + + // Get route pattern if available (chi specific) + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Record metrics + attrs := []attribute.KeyValue{ + attribute.String("method", r.Method), + attribute.String("path", routePattern), + attribute.Int("status", wrapped.statusCode), + } + + m.requestsTotal.Add(r.Context(), 1, metric.WithAttributes(attrs...)) + m.requestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...)) + }) +} + +// responseWriter wraps http.ResponseWriter to capture the status code. +type responseWriter struct { + http.ResponseWriter + statusCode int +} + +func (rw *responseWriter) WriteHeader(code int) { + rw.statusCode = code + rw.ResponseWriter.WriteHeader(code) +} + +// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. +func (rw *responseWriter) Unwrap() http.ResponseWriter { + return rw.ResponseWriter +} + +// NoopHTTPMetrics returns a middleware that does nothing (for when OTel is disabled). 
+func NoopHTTPMetrics() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return next + } +} diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 5396627d..b3f35900 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -59,6 +59,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N if err := m.createTAPDevice(tap, network.Bridge, network.Isolated); err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "allocated network", "instance_id", req.InstanceID, @@ -111,6 +112,7 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string) err if err := m.createTAPDevice(alloc.TAPDevice, network.Bridge, network.Isolated); err != nil { return fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "recreated network for restore", "instance_id", instanceID, @@ -138,6 +140,8 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro // 1. 
Delete TAP device (best effort) if err := m.deleteTAPDevice(alloc.TAPDevice); err != nil { log.WarnContext(ctx, "failed to delete TAP device", "tap", alloc.TAPDevice, "error", err) + } else { + m.recordTAPOperation(ctx, "delete") } log.InfoContext(ctx, "released network", @@ -270,4 +274,3 @@ func generateTAPName(instanceID string) string { } return TAPPrefix + strings.ToLower(shortID) } - diff --git a/lib/network/manager.go b/lib/network/manager.go index cb43af07..51e4b13d 100644 --- a/lib/network/manager.go +++ b/lib/network/manager.go @@ -9,6 +9,8 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // Manager defines the interface for network management @@ -27,19 +29,84 @@ type Manager interface { NameExists(ctx context.Context, name string) (bool, error) } +// Metrics holds the metrics instruments for network operations. +type Metrics struct { + tapOperations metric.Int64Counter +} + // manager implements the Manager interface type manager struct { - paths *paths.Paths - config *config.Config - mu sync.Mutex // Protects network allocation operations (IP allocation) + paths *paths.Paths + config *config.Config + mu sync.Mutex // Protects network allocation operations (IP allocation) + metrics *Metrics } -// NewManager creates a new network manager -func NewManager(p *paths.Paths, cfg *config.Config) Manager { - return &manager{ +// NewManager creates a new network manager. +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, cfg *config.Config, meter metric.Meter) Manager { + m := &manager{ paths: p, config: cfg, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newNetworkMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newNetworkMetrics creates and registers all network metrics. 
+func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for allocations + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + allocs, err := m.ListAllocations(ctx) + if err != nil { + return nil + } + o.ObserveInt64(allocationsTotal, int64(len(allocs))) + return nil + }, + allocationsTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + tapOperations: tapOperations, + }, nil +} + +// recordTAPOperation records a TAP device operation. +func (m *manager) recordTAPOperation(ctx context.Context, operation string) { + if m.metrics == nil { + return + } + m.metrics.tapOperations.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) } // Initialize initializes the network manager and creates default network. @@ -100,4 +167,3 @@ func (m *manager) getDefaultNetwork(ctx context.Context) (*Network, error) { CreatedAt: time.Time{}, // Unknown for default }, nil } - diff --git a/lib/otel/README.md b/lib/otel/README.md new file mode 100644 index 00000000..0f0187a7 --- /dev/null +++ b/lib/otel/README.md @@ -0,0 +1,92 @@ +# OpenTelemetry + +Provides OpenTelemetry initialization and metric definitions for Hypeman. 
+ +## Features + +- OTLP export for traces and metrics (gRPC) +- Runtime metrics (Go GC, goroutines, memory) +- Application-specific metrics per subsystem +- Graceful degradation (failures don't crash the app) + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_ENABLED` | Enable OpenTelemetry | `false` | +| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | +| `OTEL_SERVICE_NAME` | Service name | `hypeman` | +| `OTEL_INSECURE` | Disable TLS for OTLP | `true` | + +## Metrics + +### System +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_uptime_seconds` | gauge | Process uptime | +| `hypeman_info` | gauge | Build info (version, go_version labels) | + +### HTTP +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_http_requests_total` | counter | method, path, status | Total HTTP requests | +| `hypeman_http_request_duration_seconds` | histogram | method, path, status | Request latency | + +### Images +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_images_build_queue_length` | gauge | | Current build queue size | +| `hypeman_images_build_duration_seconds` | histogram | status | Image build time | +| `hypeman_images_total` | gauge | status | Cached images count | +| `hypeman_images_pulls_total` | counter | status | Registry pulls | + +### Instances +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_instances_total` | gauge | state | Instances by state | +| `hypeman_instances_create_duration_seconds` | histogram | status | Create time | +| `hypeman_instances_restore_duration_seconds` | histogram | status | Restore time | +| `hypeman_instances_standby_duration_seconds` | histogram | status | Standby time | +| `hypeman_instances_state_transitions_total` | counter | from, to | State transitions | + +### Network +| Metric | Type | Labels | Description 
| +|--------|------|--------|-------------| +| `hypeman_network_allocations_total` | gauge | | Active IP allocations | +| `hypeman_network_tap_operations_total` | counter | operation | TAP create/delete ops | + +### Volumes +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_volumes_total` | gauge | Volume count | +| `hypeman_volumes_allocated_bytes` | gauge | Total provisioned size | +| `hypeman_volumes_used_bytes` | gauge | Actual disk space consumed | +| `hypeman_volumes_create_duration_seconds` | histogram | Creation time | + +### VMM +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_vmm_api_duration_seconds` | histogram | operation, status | CH API latency | +| `hypeman_vmm_api_errors_total` | counter | operation | CH API errors | + +### Exec +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_exec_sessions_total` | counter | status, exit_code | Exec sessions | +| `hypeman_exec_duration_seconds` | histogram | status | Command duration | +| `hypeman_exec_bytes_sent_total` | counter | | Bytes to guest (stdin) | +| `hypeman_exec_bytes_received_total` | counter | | Bytes from guest (stdout+stderr) | + +## Usage + +```go +provider, shutdown, err := otel.Init(ctx, otel.Config{ + Enabled: true, + Endpoint: "localhost:4317", + ServiceName: "hypeman", +}) +defer shutdown(ctx) + +meter := provider.Meter // Use for creating metrics +``` + diff --git a/lib/otel/metrics.go b/lib/otel/metrics.go new file mode 100644 index 00000000..6532703d --- /dev/null +++ b/lib/otel/metrics.go @@ -0,0 +1,317 @@ +package otel + +import ( + "go.opentelemetry.io/otel/metric" +) + +// ImageMetrics holds metrics for the image manager. 
+type ImageMetrics struct { + BuildQueueLength metric.Int64ObservableGauge + BuildDuration metric.Float64Histogram + ImagesTotal metric.Int64ObservableGauge + PullsTotal metric.Int64Counter +} + +// NewImageMetrics creates metrics for the image manager. +func NewImageMetrics(meter metric.Meter) (*ImageMetrics, error) { + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + return &ImageMetrics{ + BuildQueueLength: buildQueueLength, + BuildDuration: buildDuration, + ImagesTotal: imagesTotal, + PullsTotal: pullsTotal, + }, nil +} + +// InstanceMetrics holds metrics for the instance manager. +type InstanceMetrics struct { + InstancesTotal metric.Int64ObservableGauge + CreateDuration metric.Float64Histogram + RestoreDuration metric.Float64Histogram + StandbyDuration metric.Float64Histogram + StateTransitions metric.Int64Counter +} + +// NewInstanceMetrics creates metrics for the instance manager. 
+func NewInstanceMetrics(meter metric.Meter) (*InstanceMetrics, error) { + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + return &InstanceMetrics{ + InstancesTotal: instancesTotal, + CreateDuration: createDuration, + RestoreDuration: restoreDuration, + StandbyDuration: standbyDuration, + StateTransitions: stateTransitions, + }, nil +} + +// NetworkMetrics holds metrics for the network manager. +type NetworkMetrics struct { + AllocationsTotal metric.Int64ObservableGauge + TapOperations metric.Int64Counter +} + +// NewNetworkMetrics creates metrics for the network manager. 
+func NewNetworkMetrics(meter metric.Meter) (*NetworkMetrics, error) { + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + return &NetworkMetrics{ + AllocationsTotal: allocationsTotal, + TapOperations: tapOperations, + }, nil +} + +// VolumeMetrics holds metrics for the volume manager. +type VolumeMetrics struct { + VolumesTotal metric.Int64ObservableGauge + AllocatedBytes metric.Int64ObservableGauge + UsedBytes metric.Int64ObservableGauge + CreateDuration metric.Float64Histogram +} + +// NewVolumeMetrics creates metrics for the volume manager. +func NewVolumeMetrics(meter metric.Meter) (*VolumeMetrics, error) { + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &VolumeMetrics{ + VolumesTotal: volumesTotal, + AllocatedBytes: allocatedBytes, + UsedBytes: usedBytes, + CreateDuration: createDuration, + }, nil +} 
+ +// ExecMetrics holds metrics for the exec client. +type ExecMetrics struct { + SessionsTotal metric.Int64Counter + Duration metric.Float64Histogram + BytesSentTotal metric.Int64Counter + BytesReceivedTotal metric.Int64Counter +} + +// NewExecMetrics creates metrics for the exec client. +func NewExecMetrics(meter metric.Meter) (*ExecMetrics, error) { + sessionsTotal, err := meter.Int64Counter( + "hypeman_exec_sessions_total", + metric.WithDescription("Total number of exec sessions"), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "hypeman_exec_duration_seconds", + metric.WithDescription("Exec command duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + bytesSentTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_sent_total", + metric.WithDescription("Total bytes sent to guest (stdin)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + bytesReceivedTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_received_total", + metric.WithDescription("Total bytes received from guest (stdout+stderr)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + return &ExecMetrics{ + SessionsTotal: sessionsTotal, + Duration: duration, + BytesSentTotal: bytesSentTotal, + BytesReceivedTotal: bytesReceivedTotal, + }, nil +} + +// VMMMetrics holds metrics for the VMM client. +type VMMMetrics struct { + APIDuration metric.Float64Histogram + APIErrorsTotal metric.Int64Counter +} + +// NewVMMMetrics creates metrics for the VMM client. 
+func NewVMMMetrics(meter metric.Meter) (*VMMMetrics, error) { + apiDuration, err := meter.Float64Histogram( + "hypeman_vmm_api_duration_seconds", + metric.WithDescription("Cloud Hypervisor API call duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + apiErrorsTotal, err := meter.Int64Counter( + "hypeman_vmm_api_errors_total", + metric.WithDescription("Total number of Cloud Hypervisor API errors"), + ) + if err != nil { + return nil, err + } + + return &VMMMetrics{ + APIDuration: apiDuration, + APIErrorsTotal: apiErrorsTotal, + }, nil +} + +// HTTPMetrics holds metrics for HTTP middleware. +type HTTPMetrics struct { + RequestsTotal metric.Int64Counter + RequestDuration metric.Float64Histogram +} + +// NewHTTPMetrics creates metrics for HTTP middleware. +func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { + requestsTotal, err := meter.Int64Counter( + "hypeman_http_requests_total", + metric.WithDescription("Total number of HTTP requests"), + ) + if err != nil { + return nil, err + } + + requestDuration, err := meter.Float64Histogram( + "hypeman_http_request_duration_seconds", + metric.WithDescription("HTTP request duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &HTTPMetrics{ + RequestsTotal: requestsTotal, + RequestDuration: requestDuration, + }, nil +} diff --git a/lib/otel/otel.go b/lib/otel/otel.go new file mode 100644 index 00000000..7c3e8f20 --- /dev/null +++ b/lib/otel/otel.go @@ -0,0 +1,214 @@ +// Package otel provides OpenTelemetry initialization and configuration. 
+package otel + +import ( + "context" + "fmt" + goruntime "runtime" + "time" + + otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.24.0" + "go.opentelemetry.io/otel/trace" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// Config holds OpenTelemetry configuration. +type Config struct { + Enabled bool + Endpoint string + ServiceName string + Insecure bool + Version string +} + +// Provider holds initialized OTel providers. +type Provider struct { + TracerProvider *sdktrace.TracerProvider + MeterProvider *sdkmetric.MeterProvider + Tracer trace.Tracer + Meter metric.Meter + startTime time.Time +} + +// Init initializes OpenTelemetry with the given configuration. +// Returns a shutdown function that should be called on application exit. +// If OTel is disabled, returns a no-op shutdown function. 
+func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) error, error) { + if !cfg.Enabled { + // Return no-op provider when disabled + return &Provider{ + Tracer: otel.Tracer(cfg.ServiceName), + Meter: otel.Meter(cfg.ServiceName), + startTime: time.Now(), + }, func(context.Context) error { return nil }, nil + } + + // Create resource with service information + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.Version), + ), + ) + if err != nil { + return nil, nil, fmt.Errorf("create resource: %w", err) + } + + // Setup gRPC connection options + var dialOpts []grpc.DialOption + if cfg.Insecure { + dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials())) + } + + // Create trace exporter + traceExporter, err := otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(cfg.Endpoint), + otlptracegrpc.WithDialOption(dialOpts...), + ) + if err != nil { + return nil, nil, fmt.Errorf("create trace exporter: %w", err) + } + + // Create tracer provider + tracerProvider := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(traceExporter), + sdktrace.WithResource(res), + ) + + // Create metric exporter + metricExporter, err := otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(cfg.Endpoint), + otlpmetricgrpc.WithDialOption(dialOpts...), + ) + if err != nil { + tracerProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create metric exporter: %w", err) + } + + // Create meter provider + meterProvider := sdkmetric.NewMeterProvider( + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExporter)), + sdkmetric.WithResource(res), + ) + + // Set global providers + otel.SetTracerProvider(tracerProvider) + otel.SetMeterProvider(meterProvider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + // Start runtime metrics collection + if 
err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("start runtime metrics: %w", err) + } + + provider := &Provider{ + TracerProvider: tracerProvider, + MeterProvider: meterProvider, + Tracer: tracerProvider.Tracer(cfg.ServiceName), + Meter: meterProvider.Meter(cfg.ServiceName), + startTime: time.Now(), + } + + // Register system metrics (uptime, info) + if err := provider.registerSystemMetrics(cfg); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("register system metrics: %w", err) + } + + shutdown := func(ctx context.Context) error { + var errs []error + if err := tracerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown tracer: %w", err)) + } + if err := meterProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) + } + if len(errs) > 0 { + return fmt.Errorf("shutdown errors: %v", errs) + } + return nil + } + + return provider, shutdown, nil +} + +// registerSystemMetrics registers uptime and info metrics. 
+func (p *Provider) registerSystemMetrics(cfg Config) error { + // Uptime gauge + uptime, err := p.Meter.Float64ObservableGauge( + "hypeman_uptime_seconds", + metric.WithDescription("Process uptime in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return fmt.Errorf("create uptime gauge: %w", err) + } + + // Info gauge (always 1, with version labels) + info, err := p.Meter.Int64ObservableGauge( + "hypeman_info", + metric.WithDescription("Hypeman build information"), + ) + if err != nil { + return fmt.Errorf("create info gauge: %w", err) + } + + _, err = p.Meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + o.ObserveFloat64(uptime, time.Since(p.startTime).Seconds()) + o.ObserveInt64(info, 1, + metric.WithAttributes( + semconv.ServiceVersion(cfg.Version), + semconv.TelemetrySDKLanguageGo, + ), + ) + return nil + }, + uptime, + info, + ) + if err != nil { + return fmt.Errorf("register callback: %w", err) + } + + return nil +} + +// TracerFor returns a tracer for the given subsystem. +func (p *Provider) TracerFor(subsystem string) trace.Tracer { + if p.TracerProvider != nil { + return p.TracerProvider.Tracer(subsystem) + } + return otel.Tracer(subsystem) +} + +// MeterFor returns a meter for the given subsystem. +func (p *Provider) MeterFor(subsystem string) metric.Meter { + if p.MeterProvider != nil { + return p.MeterProvider.Meter(subsystem) + } + return otel.Meter(subsystem) +} + +// GoVersion returns the Go version used to build the binary. 
+func GoVersion() string { + return goruntime.Version() +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 2daa0749..fa9d0b7c 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log/slog" - "os" "github.com/c2h5oh/datasize" "github.com/onkernel/hypeman/cmd/api/config" @@ -17,11 +16,10 @@ import ( "github.com/onkernel/hypeman/lib/volumes" ) -// ProvideLogger provides a structured logger +// ProvideLogger provides a structured logger with subsystem-specific levels func ProvideLogger() *slog.Logger { - return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ - Level: slog.LevelInfo, - })) + cfg := logger.NewConfig() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg) } // ProvideContext provides a context with logger attached @@ -41,7 +39,7 @@ func ProvidePaths(cfg *config.Config) *paths.Paths { // ProvideImageManager provides the image manager func ProvideImageManager(p *paths.Paths, cfg *config.Config) (images.Manager, error) { - return images.NewManager(p, cfg.MaxConcurrentBuilds) + return images.NewManager(p, cfg.MaxConcurrentBuilds, nil) } // ProvideSystemManager provides the system manager @@ -51,7 +49,7 @@ func ProvideSystemManager(p *paths.Paths) system.Manager { // ProvideNetworkManager provides the network manager func ProvideNetworkManager(p *paths.Paths, cfg *config.Config) network.Manager { - return network.NewManager(p, cfg) + return network.NewManager(p, cfg, nil) } // ProvideInstanceManager provides the instance manager @@ -90,7 +88,7 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima MaxTotalMemory: maxTotalMemory, } - return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits), nil + return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil), nil } // ProvideVolumeManager provides the volume manager @@ -105,5 +103,5 @@ 
func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, maxTotalVolumeStorage = int64(storageSize) } - return volumes.NewManager(p, maxTotalVolumeStorage), nil + return volumes.NewManager(p, maxTotalVolumeStorage, nil), nil } diff --git a/lib/vmm/metrics.go b/lib/vmm/metrics.go new file mode 100644 index 00000000..39ef4dd1 --- /dev/null +++ b/lib/vmm/metrics.go @@ -0,0 +1,66 @@ +package vmm + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for VMM operations. +type Metrics struct { + APIDuration metric.Float64Histogram + APIErrorsTotal metric.Int64Counter +} + +// NewMetrics creates VMM metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + apiDuration, err := meter.Float64Histogram( + "hypeman_vmm_api_duration_seconds", + metric.WithDescription("Cloud Hypervisor API call duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + apiErrorsTotal, err := meter.Int64Counter( + "hypeman_vmm_api_errors_total", + metric.WithDescription("Total number of Cloud Hypervisor API errors"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + APIDuration: apiDuration, + APIErrorsTotal: apiErrorsTotal, + }, nil +} + +// RecordAPICall records the duration and status of an API call. 
+func (m *Metrics) RecordAPICall(ctx context.Context, operation string, start time.Time, err error) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if err != nil { + status = "error" + m.APIErrorsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + + m.APIDuration.Record(ctx, duration, + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) +} diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index 2292e98c..ac5b375a 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -6,11 +6,14 @@ import ( "io" "os" "sync" + "syscall" "time" "github.com/nrednav/cuid2" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // Manager provides volume lifecycle operations @@ -34,20 +37,126 @@ type Manager interface { GetVolumePath(id string) string } +// Metrics holds the metrics instruments for volume operations. +type Metrics struct { + createDuration metric.Float64Histogram +} + type manager struct { paths *paths.Paths maxTotalVolumeStorage int64 // Maximum total volume storage in bytes (0 = unlimited) volumeLocks sync.Map // map[string]*sync.RWMutex - per-volume locks + metrics *Metrics } -// NewManager creates a new volumes manager -// maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited) -func NewManager(p *paths.Paths, maxTotalVolumeStorage int64) Manager { - return &manager{ +// NewManager creates a new volumes manager. +// maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited). +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, maxTotalVolumeStorage int64, meter metric.Meter) Manager { + m := &manager{ paths: p, maxTotalVolumeStorage: maxTotalVolumeStorage, volumeLocks: sync.Map{}, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newVolumeMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newVolumeMetrics creates and registers all volume metrics. +func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + volumes, err := m.ListVolumes(ctx) + if err != nil { + return nil + } + + o.ObserveInt64(volumesTotal, int64(len(volumes))) + + var totalAllocated, totalUsed int64 + for _, vol := range volumes { + // Allocated = provisioned size in GB + totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 + + // Used = actual disk blocks consumed (for sparse files) + diskPath := m.paths.VolumeData(vol.Id) + if stat, err := os.Stat(diskPath); err == nil { + // Get actual blocks used via syscall + if sysStat, ok := 
stat.Sys().(*syscall.Stat_t); ok { + totalUsed += sysStat.Blocks * 512 // Blocks are in 512-byte units + } + } + } + + o.ObserveInt64(allocatedBytes, totalAllocated) + o.ObserveInt64(usedBytes, totalUsed) + return nil + }, + volumesTotal, + allocatedBytes, + usedBytes, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + }, nil +} + +// recordCreateDuration records the volume creation duration. +func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.createDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) } // getVolumeLock returns or creates a lock for a specific volume @@ -92,6 +201,8 @@ func (m *manager) calculateTotalVolumeStorage(ctx context.Context) (int64, error // CreateVolume creates a new volume func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -145,12 +256,15 @@ func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*V return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } // CreateVolumeFromArchive creates a new volume pre-populated with content from a tar.gz archive. // The archive is safely extracted with size limits to prevent tar bombs. 
func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeFromArchiveRequest, archive io.Reader) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -223,6 +337,7 @@ func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeF return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } diff --git a/lib/volumes/manager_test.go b/lib/volumes/manager_test.go index 3752e230..130fb1c4 100644 --- a/lib/volumes/manager_test.go +++ b/lib/volumes/manager_test.go @@ -23,7 +23,7 @@ func setupTestManager(t *testing.T) (Manager, *paths.Paths, func()) { // Create required directories require.NoError(t, os.MkdirAll(p.VolumesDir(), 0755)) - manager := NewManager(p, 0) // 0 = unlimited storage + manager := NewManager(p, 0, nil) // 0 = unlimited storage cleanup := func() { os.RemoveAll(tmpDir) @@ -389,4 +389,3 @@ func TestMultiAttach_ConcurrentRWConflict(t *testing.T) { assert.Len(t, vol.Attachments, 1, "Should have exactly one attachment") assert.False(t, vol.Attachments[0].Readonly, "Attachment should be read-write") } - From 0ea3118e7ac4b08982910541cad68ecd7e3cd98f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:16:40 -0500 Subject: [PATCH 02/14] Update top level README --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index dc7a3577..1bf3ff83 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,10 @@ Hypeman can be configured using the following environment variables: | `DNS_SERVER` | DNS server IP address for VMs | `1.1.1.1` | | `MAX_CONCURRENT_BUILDS` | Maximum number of concurrent image builds | `1` | | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | +| `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | +| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `LOG_LEVEL` | Default log 
level (debug, info, warn, error) | `info` | +| `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | **Important: Subnet Configuration** @@ -159,6 +163,24 @@ make dev ``` The server will start on port 8080 (configurable via `PORT` environment variable). +#### Local OpenTelemetry (optional) + +To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir): + +```bash +# Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin) +docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ + grafana/otel-lgtm:latest + +# Enable OTel in .env +echo "OTEL_ENABLED=true" >> .env + +# Restart dev server +make dev +``` + +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. + ### Testing Network tests require elevated permissions to create bridges and TAP devices. From 6d3f4cb8c62178a9bddf2798e83459206c2885ac Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:24:37 -0500 Subject: [PATCH 03/14] tweak global labels --- README.md | 7 +++-- cmd/api/config/config.go | 65 +++++++++++++++++++++++++++++++++------- cmd/api/main.go | 12 ++++---- lib/otel/README.md | 2 ++ lib/otel/otel.go | 14 +++++---- 5 files changed, 78 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 1bf3ff83..81f4a52b 100644 --- a/README.md +++ b/README.md @@ -75,8 +75,10 @@ Hypeman can be configured using the following environment variables: | `DNS_SERVER` | DNS server IP address for VMs | `1.1.1.1` | | `MAX_CONCURRENT_BUILDS` | Maximum number of concurrent image builds | `1` | | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | +| `ENV` | Deployment environment (filters telemetry, e.g. 
your name for dev) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | | `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID for telemetry (differentiates multiple servers) | hostname | | `LOG_LEVEL` | Default log level (debug, info, warn, error) | `info` | | `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | @@ -172,14 +174,15 @@ To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ grafana/otel-lgtm:latest -# Enable OTel in .env +# Enable OTel in .env (set ENV to your name to filter your telemetry) echo "OTEL_ENABLED=true" >> .env +echo "ENV=yourname" >> .env # Restart dev server make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. Filter by `deployment.environment="yourname"` to see only your data. ### Testing diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index ec810d7e..d04a627e 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -2,11 +2,52 @@ package config import ( "os" + "runtime/debug" "strconv" "github.com/joho/godotenv" ) +func getHostname() string { + if h, err := os.Hostname(); err == nil { + return h + } + return "unknown" +} + +// getBuildVersion extracts version info from Go's embedded build info. +// Returns git short hash + "-dirty" suffix if uncommitted changes, or "unknown" if unavailable. 
+func getBuildVersion() string { + info, ok := debug.ReadBuildInfo() + if !ok { + return "unknown" + } + + var revision string + var dirty bool + for _, s := range info.Settings { + switch s.Key { + case "vcs.revision": + revision = s.Value + case "vcs.modified": + dirty = s.Value == "true" + } + } + + if revision == "" { + return "unknown" + } + + // Use short hash (8 chars) + if len(revision) > 8 { + revision = revision[:8] + } + if dirty { + revision += "-dirty" + } + return revision +} + type Config struct { Port string DataDir string @@ -32,11 +73,13 @@ type Config struct { MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) // OpenTelemetry configuration - OtelEnabled bool // Enable OpenTelemetry - OtelEndpoint string // OTLP endpoint (gRPC) - OtelServiceName string // Service name for tracing - OtelInsecure bool // Disable TLS for OTLP - Version string // Application version for telemetry + OtelEnabled bool // Enable OpenTelemetry + OtelEndpoint string // OTLP endpoint (gRPC) + OtelServiceName string // Service name for tracing + OtelServiceInstanceID string // Service instance ID (default: hostname) + OtelInsecure bool // Disable TLS for OTLP + Version string // Application version for telemetry + Env string // Deployment environment (e.g., dev, staging, prod) // Logging configuration LogLevel string // Default log level (debug, info, warn, error) @@ -73,11 +116,13 @@ func Load() *Config { MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), // OpenTelemetry configuration - OtelEnabled: getEnvBool("OTEL_ENABLED", false), - OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), - OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), - OtelInsecure: getEnvBool("OTEL_INSECURE", true), - Version: getEnv("VERSION", "dev"), + OtelEnabled: getEnvBool("OTEL_ENABLED", false), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), + OtelServiceInstanceID: 
getEnv("OTEL_SERVICE_INSTANCE_ID", getHostname()), + OtelInsecure: getEnvBool("OTEL_INSECURE", true), + Version: getEnv("VERSION", getBuildVersion()), + Env: getEnv("ENV", "unset"), // Logging configuration LogLevel: getEnv("LOG_LEVEL", "info"), diff --git a/cmd/api/main.go b/cmd/api/main.go index 4ff139b1..d5ae60e1 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -43,11 +43,13 @@ func run() error { // Initialize OpenTelemetry (before wire initialization) otelCfg := otel.Config{ - Enabled: cfg.OtelEnabled, - Endpoint: cfg.OtelEndpoint, - ServiceName: cfg.OtelServiceName, - Insecure: cfg.OtelInsecure, - Version: cfg.Version, + Enabled: cfg.OtelEnabled, + Endpoint: cfg.OtelEndpoint, + ServiceName: cfg.OtelServiceName, + ServiceInstanceID: cfg.OtelServiceInstanceID, + Insecure: cfg.OtelInsecure, + Version: cfg.Version, + Env: cfg.Env, } otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) diff --git a/lib/otel/README.md b/lib/otel/README.md index 0f0187a7..3c60c432 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -13,9 +13,11 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. | Variable | Description | Default | |----------|-------------|---------| +| `ENV` | Deployment environment (`deployment.environment` attribute) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry | `false` | | `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | | `OTEL_SERVICE_NAME` | Service name | `hypeman` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | | `OTEL_INSECURE` | Disable TLS for OTLP | `true` | ## Metrics diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 7c3e8f20..26ea1e4e 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -24,11 +24,13 @@ import ( // Config holds OpenTelemetry configuration. 
type Config struct { - Enabled bool - Endpoint string - ServiceName string - Insecure bool - Version string + Enabled bool + Endpoint string + ServiceName string + ServiceInstanceID string + Insecure bool + Version string + Env string } // Provider holds initialized OTel providers. @@ -60,6 +62,8 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err semconv.SchemaURL, semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.Version), + semconv.ServiceInstanceID(cfg.ServiceInstanceID), + semconv.DeploymentEnvironment(cfg.Env), ), ) if err != nil { From e4772926e6d22c660e5afbf8213dadc897e879b5 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:10:24 -0500 Subject: [PATCH 04/14] Fix otel connection --- README.md | 21 ++- cmd/api/config/config.go | 2 +- dashboards/hypeman.json | 395 +++++++++++++++++++++++++++++++++++++++ lib/otel/README.md | 2 +- lib/otel/otel.go | 4 +- 5 files changed, 417 insertions(+), 7 deletions(-) create mode 100644 dashboards/hypeman.json diff --git a/README.md b/README.md index 81f4a52b..ae9712eb 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Hypeman can be configured using the following environment variables: | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | | `ENV` | Deployment environment (filters telemetry, e.g. 
your name for dev) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | -| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `127.0.0.1:4317` | | `OTEL_SERVICE_INSTANCE_ID` | Instance ID for telemetry (differentiates multiple servers) | hostname | | `LOG_LEVEL` | Default log level (debug, info, warn, error) | `info` | | `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | @@ -171,9 +171,17 @@ To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana ```bash # Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin) -docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ +# Note, if you are developing on a shared server, you can use the same LGTM stack as your peer(s) +# You will be able to sort your metrics, traces, and logs using the ENV configuration (see below) +docker run -d --name lgtm \ + -p 127.0.0.1:3000:3000 \ + -p 127.0.0.1:4317:4317 \ + -p 127.0.0.1:4318:4318 \ grafana/otel-lgtm:latest +# If developing on a remote server, forward the port to your local machine: +# ssh -L 3001:localhost:3000 your-server (then open http://localhost:3001) + # Enable OTel in .env (set ENV to your name to filter your telemetry) echo "OTEL_ENABLED=true" >> .env echo "ENV=yourname" >> .env @@ -182,7 +190,14 @@ echo "ENV=yourname" >> .env make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. Filter by `deployment.environment="yourname"` to see only your data. +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. + +**Import the Hypeman dashboard:** +1. Go to Dashboards → New → Import +2. Upload `dashboards/hypeman.json` or paste its contents +3. Select the Prometheus datasource and click Import + +Use the Environment/Instance dropdowns to filter by `deployment.environment` or `service.instance.id`. 
### Testing diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index d04a627e..18db2007 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -117,7 +117,7 @@ func Load() *Config { // OpenTelemetry configuration OtelEnabled: getEnvBool("OTEL_ENABLED", false), - OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "127.0.0.1:4317"), OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), OtelServiceInstanceID: getEnv("OTEL_SERVICE_INSTANCE_ID", getHostname()), OtelInsecure: getEnvBool("OTEL_INSECURE", true), diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json new file mode 100644 index 00000000..dc5fb605 --- /dev/null +++ b/dashboards/hypeman.json @@ -0,0 +1,395 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "gridPos": { "h": 3, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_uptime_seconds", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 3, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_total", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "title": "Instances", + "type": "stat" + }, + { + "gridPos": { "h": 3, 
"w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_total", + "refId": "A" + } + ], + "title": "Volumes", + "type": "stat" + }, + { + "gridPos": { "h": 3, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_total", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Images", + "type": "stat" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_server_request_duration_seconds_sum[5m]) / rate(http_server_request_duration_seconds_count[5m])", + "legendFormat": "{{http_route}}", + "refId": "A" + } + ], + "title": "HTTP Request Latency (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_server_request_duration_seconds_count[5m])", + "legendFormat": "{{http_route}} {{http_status_code}}", + "refId": "A" + } + ], + 
"title": "HTTP Requests/sec", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "reqps" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 11 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_create_duration_seconds_bucket[5m]))", + "legendFormat": "p95 create", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_restore_duration_seconds_bucket[5m]))", + "legendFormat": "p95 restore", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_standby_duration_seconds_bucket[5m]))", + "legendFormat": "p95 standby", + "refId": "C" + } + ], + "title": "Instance Operation Latency (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 11 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_allocated_bytes", + "legendFormat": "allocated", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_used_bytes", + "legendFormat": "used", + "refId": "B" + } + ], + "title": "Volume Storage", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 11 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", 
"placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_bytes_sent_total[5m])", + "legendFormat": "sent (stdin)", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_bytes_received_total[5m])", + "legendFormat": "received (stdout+stderr)", + "refId": "B" + } + ], + "title": "Exec Throughput", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "Bps" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 19 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_build_queue_length", + "legendFormat": "queue length", + "refId": "A" + } + ], + "title": "Image Build Queue", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 19 }, + "id": 11, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_vmm_api_duration_seconds_bucket[5m]))", + "legendFormat": "p95 {{operation}}", + "refId": "A" + } + ], + "title": "VMM API Latency (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 19 }, + "id": 12, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_network_allocations_total", + 
"legendFormat": "IP allocations", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_network_tap_operations_total[5m])", + "legendFormat": "TAP ops/sec ({{operation}})", + "refId": "B" + } + ], + "title": "Network", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, + "id": 13, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_memstats_heap_inuse_bytes", + "legendFormat": "heap in use", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_memstats_stack_inuse_bytes", + "legendFormat": "stack in use", + "refId": "B" + } + ], + "title": "Go Memory", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 27 }, + "id": 14, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_goroutines", + "legendFormat": "goroutines", + "refId": "A" + } + ], + "title": "Goroutines", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "tags": ["hypeman"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, deployment_environment)", + "hide": 0, + "includeAll": true, + "label": "Environment", + "multi": false, + "name": "env", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment)", "refId": "A" }, + "refresh": 1, + "regex": "", + 
"skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, service_instance_id)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, service_instance_id)", "refId": "A" }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Hypeman", + "uid": "hypeman-overview", + "version": 1, + "weekStart": "" +} + diff --git a/lib/otel/README.md b/lib/otel/README.md index 3c60c432..ff181ef3 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -15,7 +15,7 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. |----------|-------------|---------| | `ENV` | Deployment environment (`deployment.environment` attribute) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry | `false` | -| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | +| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `127.0.0.1:4317` | | `OTEL_SERVICE_NAME` | Service name | `hypeman` | | `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | | `OTEL_INSECURE` | Disable TLS for OTLP | `true` | diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 26ea1e4e..21f47af8 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -16,7 +16,7 @@ import ( sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" - semconv "go.opentelemetry.io/otel/semconv/v1.24.0" + semconv "go.opentelemetry.io/otel/semconv/v1.37.0" "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -63,7 +63,7 @@ func 
Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.Version), semconv.ServiceInstanceID(cfg.ServiceInstanceID), - semconv.DeploymentEnvironment(cfg.Env), + semconv.DeploymentEnvironmentName(cfg.Env), ), ) if err != nil { From 3ce88c477cc2e97fc15e8d3c290bc697a815abba Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:12:38 -0500 Subject: [PATCH 05/14] Fix non-tls otel config --- lib/otel/otel.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 21f47af8..acac499a 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -18,8 +18,6 @@ import ( sdktrace "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.37.0" "go.opentelemetry.io/otel/trace" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" ) // Config holds OpenTelemetry configuration. @@ -70,17 +68,14 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err return nil, nil, fmt.Errorf("create resource: %w", err) } - // Setup gRPC connection options - var dialOpts []grpc.DialOption - if cfg.Insecure { - dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials())) - } - // Create trace exporter - traceExporter, err := otlptracegrpc.New(ctx, + traceOpts := []otlptracegrpc.Option{ otlptracegrpc.WithEndpoint(cfg.Endpoint), - otlptracegrpc.WithDialOption(dialOpts...), - ) + } + if cfg.Insecure { + traceOpts = append(traceOpts, otlptracegrpc.WithInsecure()) + } + traceExporter, err := otlptracegrpc.New(ctx, traceOpts...) 
if err != nil { return nil, nil, fmt.Errorf("create trace exporter: %w", err) } @@ -92,10 +87,13 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err ) // Create metric exporter - metricExporter, err := otlpmetricgrpc.New(ctx, + metricOpts := []otlpmetricgrpc.Option{ otlpmetricgrpc.WithEndpoint(cfg.Endpoint), - otlpmetricgrpc.WithDialOption(dialOpts...), - ) + } + if cfg.Insecure { + metricOpts = append(metricOpts, otlpmetricgrpc.WithInsecure()) + } + metricExporter, err := otlpmetricgrpc.New(ctx, metricOpts...) if err != nil { tracerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("create metric exporter: %w", err) From b461a031aaaea87b99dec86ed81f188b693a17b8 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:34:13 -0500 Subject: [PATCH 06/14] Fix dashboard --- README.md | 2 ++ dashboards/hypeman.json | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ae9712eb..ac2a048f 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,8 @@ docker run -d --name lgtm \ -p 127.0.0.1:3000:3000 \ -p 127.0.0.1:4317:4317 \ -p 127.0.0.1:4318:4318 \ + -p 127.0.0.1:9090:9090 \ + -p 127.0.0.1:4040:4040 \ grafana/otel-lgtm:latest # If developing on a remote server, forward the port to your local machine: diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index dc5fb605..31c2d3f0 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -108,8 +108,8 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(http_server_request_duration_seconds_sum[5m]) / rate(http_server_request_duration_seconds_count[5m])", - "legendFormat": "{{http_route}}", + "expr": "rate(hypeman_http_request_duration_seconds_sum[5m]) / rate(hypeman_http_request_duration_seconds_count[5m])", + "legendFormat": "{{http_request_method}} {{http_route}}", "refId": "A" } ], @@ -130,8 +130,8 @@ "targets": [ { "datasource": { "type": 
"prometheus", "uid": "prometheus" }, - "expr": "rate(http_server_request_duration_seconds_count[5m])", - "legendFormat": "{{http_route}} {{http_status_code}}", + "expr": "rate(hypeman_http_requests_total[5m])", + "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}", "refId": "A" } ], @@ -306,14 +306,14 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memstats_heap_inuse_bytes", - "legendFormat": "heap in use", + "expr": "go_memory_used_bytes", + "legendFormat": "memory used", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memstats_stack_inuse_bytes", - "legendFormat": "stack in use", + "expr": "go_memory_gc_goal_bytes", + "legendFormat": "GC goal", "refId": "B" } ], @@ -334,7 +334,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_goroutines", + "expr": "go_goroutine_count", "legendFormat": "goroutines", "refId": "A" } @@ -351,14 +351,14 @@ { "current": { "selected": false, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(hypeman_uptime_seconds, deployment_environment)", + "definition": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "hide": 0, "includeAll": true, "label": "Environment", "multi": false, "name": "env", "options": [], - "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment)", "refId": "A" }, + "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "refId": "A" }, "refresh": 1, "regex": "", "skipUrlSync": false, @@ -392,4 +392,3 @@ "version": 1, "weekStart": "" } - From d4dffc2c929adf2785629bf6cd05cfb6bed4e4c4 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:10:29 -0500 Subject: [PATCH 07/14] Fix logs --- cmd/api/main.go | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) 
diff --git a/cmd/api/main.go b/cmd/api/main.go index d5ae60e1..f8e91a37 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "os/signal" + "strings" "syscall" "time" @@ -130,12 +131,8 @@ func run() error { // Create router r := chi.NewRouter() - // Add OpenTelemetry tracing middleware if enabled - if cfg.OtelEnabled { - r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) - } - - // Add HTTP metrics middleware if OTel is enabled + // Prepare HTTP metrics middleware (applied inside API group, not globally) + // Global application breaks WebSocket (Hijacker) and SSE (Flusher) var httpMetricsMw func(http.Handler) http.Handler if otelProvider != nil && otelProvider.Meter != nil { httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) @@ -143,10 +140,6 @@ func run() error { httpMetricsMw = httpMetrics.Middleware } } - if httpMetricsMw == nil { - httpMetricsMw = mw.NoopHTTPMetrics() - } - r.Use(httpMetricsMw) // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() @@ -174,6 +167,24 @@ func run() error { r.Use(middleware.RealIP) r.Use(middleware.Logger) r.Use(middleware.Recoverer) + + // OpenTelemetry middleware (inside group to avoid breaking WebSocket/SSE on other routes) + if cfg.OtelEnabled { + r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) + } + if httpMetricsMw != nil { + // Skip HTTP metrics for SSE streaming endpoints (logs) + r.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/logs") { + next.ServeHTTP(w, r) + return + } + httpMetricsMw(next).ServeHTTP(w, r) + }) + }) + } + r.Use(middleware.Timeout(60 * time.Second)) // OpenAPI request validation with authentication From c287f6d6a3739e5409591314d04e547dd15fa598 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:33:03 -0500 Subject: [PATCH 08/14] Fix all metrics --- cmd/api/main.go | 7 +- 
dashboards/hypeman.json | 130 ++++++++++++++++++++++++++++--------- lib/providers/providers.go | 14 ++-- lib/vmm/client.go | 60 +++++++++++++---- lib/vmm/metrics.go | 9 +++ 5 files changed, 171 insertions(+), 49 deletions(-) diff --git a/cmd/api/main.go b/cmd/api/main.go index f8e91a37..a50d4405 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -27,6 +27,7 @@ import ( mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" "github.com/onkernel/hypeman/lib/otel" + "github.com/onkernel/hypeman/lib/vmm" "github.com/riandyrn/otelchi" "golang.org/x/sync/errgroup" ) @@ -68,12 +69,16 @@ func run() error { }() } - // Initialize exec metrics if OTel is enabled + // Initialize exec and vmm metrics if OTel is enabled if otelProvider != nil && otelProvider.Meter != nil { execMetrics, err := exec.NewMetrics(otelProvider.Meter) if err == nil { exec.SetMetrics(execMetrics) } + vmmMetrics, err := vmm.NewMetrics(otelProvider.Meter) + if err == nil { + vmm.SetMetrics(vmmMetrics) + } } // Initialize app with wire diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index 31c2d3f0..d3b778e6 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -22,7 +22,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_uptime_seconds", + "expr": "hypeman_uptime_seconds{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "refId": "A" } ], @@ -47,7 +47,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_instances_total", + "expr": "sum(hypeman_instances_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (state)", "legendFormat": "{{state}}", "refId": "A" } @@ -69,7 +69,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_total", + "expr": "hypeman_volumes_total{deployment_environment_name=~\"$env\", 
service_instance_id=~\"$instance\"}", "refId": "A" } ], @@ -90,7 +90,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_images_total", + "expr": "sum(hypeman_images_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (status)", "legendFormat": "{{status}}", "refId": "A" } @@ -108,7 +108,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_http_request_duration_seconds_sum[5m]) / rate(hypeman_http_request_duration_seconds_count[5m])", + "expr": "rate(hypeman_http_request_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m]) / rate(hypeman_http_request_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "{{http_request_method}} {{http_route}}", "refId": "A" } @@ -130,7 +130,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_http_requests_total[5m])", + "expr": "rate(hypeman_http_requests_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}", "refId": "A" } @@ -152,19 +152,19 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_create_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 create", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_restore_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, 
sum(rate(hypeman_instances_restore_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 restore", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_standby_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_standby_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 standby", "refId": "C" } @@ -186,13 +186,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_allocated_bytes", + "expr": "hypeman_volumes_allocated_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "allocated", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_used_bytes", + "expr": "hypeman_volumes_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "used", "refId": "B" } @@ -214,13 +214,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_sent_total[5m])", + "expr": "rate(hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "sent (stdin)", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_received_total[5m])", + "expr": "rate(hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "received (stdout+stderr)", "refId": "B" } @@ -242,12 +242,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_images_build_queue_length", + "expr": 
"hypeman_images_build_queue_length{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "queue length", "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "pulls/sec ({{status}})", + "refId": "B" } ], - "title": "Image Build Queue", + "title": "Image Build Queue & Pulls", "type": "timeseries" }, { @@ -260,7 +266,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_vmm_api_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_vmm_api_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le, operation))", "legendFormat": "p95 {{operation}}", "refId": "A" } @@ -282,13 +288,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_network_allocations_total", + "expr": "hypeman_network_allocations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "IP allocations", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_network_tap_operations_total[5m])", + "expr": "rate(hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "TAP ops/sec ({{operation}})", "refId": "B" } @@ -297,7 +303,53 @@ "type": "timeseries" }, { - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }, + "id": 15, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": 
"histogram_quantile(0.95, sum(rate(hypeman_images_build_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", + "legendFormat": "p95 build", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(hypeman_volumes_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", + "legendFormat": "p95 volume create", + "refId": "B" + } + ], + "title": "Image Build & Volume Create (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }, + "id": 16, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "{{from}} → {{to}}", + "refId": "A" + } + ], + "title": "Instance State Transitions/sec", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }, "id": 13, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, @@ -306,13 +358,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memory_used_bytes", + "expr": "go_memory_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "memory used", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memory_gc_goal_bytes", + "expr": "go_memory_gc_goal_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "GC goal", "refId": "B" } @@ -325,7 +377,7 @@ } }, { - "gridPos": { "h": 8, 
"w": 12, "x": 12, "y": 27 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 }, "id": 14, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, @@ -334,13 +386,31 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_goroutine_count", + "expr": "go_goroutine_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "goroutines", "refId": "A" } ], "title": "Goroutines", "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, + "id": 17, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "sessions/sec ({{status}})", + "refId": "A" + } + ], + "title": "Exec Sessions/sec", + "type": "timeseries" } ], "refresh": "10s", @@ -355,14 +425,14 @@ "hide": 0, "includeAll": true, "label": "Environment", - "multi": false, + "multi": true, "name": "env", "options": [], "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "refId": "A" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" }, { @@ -372,14 +442,14 @@ "hide": 0, "includeAll": true, "label": "Instance", - "multi": false, + "multi": true, "name": "instance", "options": [], "query": { "query": "label_values(hypeman_uptime_seconds, service_instance_id)", "refId": "A" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" } ] @@ -389,6 +459,6 @@ "timezone": "", "title": "Hypeman", "uid": "hypeman-overview", - "version": 1, + "version": 2, "weekStart": "" } diff --git a/lib/providers/providers.go b/lib/providers/providers.go 
index fa9d0b7c..93713d13 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -14,6 +14,7 @@ import ( "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel" ) // ProvideLogger provides a structured logger with subsystem-specific levels @@ -39,7 +40,8 @@ func ProvidePaths(cfg *config.Config) *paths.Paths { // ProvideImageManager provides the image manager func ProvideImageManager(p *paths.Paths, cfg *config.Config) (images.Manager, error) { - return images.NewManager(p, cfg.MaxConcurrentBuilds, nil) + meter := otel.GetMeterProvider().Meter("hypeman") + return images.NewManager(p, cfg.MaxConcurrentBuilds, meter) } // ProvideSystemManager provides the system manager @@ -49,7 +51,8 @@ func ProvideSystemManager(p *paths.Paths) system.Manager { // ProvideNetworkManager provides the network manager func ProvideNetworkManager(p *paths.Paths, cfg *config.Config) network.Manager { - return network.NewManager(p, cfg, nil) + meter := otel.GetMeterProvider().Meter("hypeman") + return network.NewManager(p, cfg, meter) } // ProvideInstanceManager provides the instance manager @@ -88,7 +91,9 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima MaxTotalMemory: maxTotalMemory, } - return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil), nil + meter := otel.GetMeterProvider().Meter("hypeman") + tracer := otel.GetTracerProvider().Tracer("hypeman") + return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, meter, tracer), nil } // ProvideVolumeManager provides the volume manager @@ -103,5 +108,6 @@ func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, maxTotalVolumeStorage = int64(storageSize) } - return volumes.NewManager(p, maxTotalVolumeStorage, nil), nil + meter := otel.GetMeterProvider().Meter("hypeman") + return 
volumes.NewManager(p, maxTotalVolumeStorage, meter), nil } diff --git a/lib/vmm/client.go b/lib/vmm/client.go index c70a6572..7fc7a906 100644 --- a/lib/vmm/client.go +++ b/lib/vmm/client.go @@ -12,6 +12,8 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // VMM wraps the generated Cloud Hypervisor client (API v0.3.0) @@ -20,15 +22,45 @@ type VMM struct { socketPath string } +// metricsRoundTripper wraps an http.RoundTripper to record metrics +type metricsRoundTripper struct { + base http.RoundTripper +} + +func (m *metricsRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + start := time.Now() + resp, err := m.base.RoundTrip(req) + + // Record metrics using global VMMMetrics + if VMMMetrics != nil { + operation := req.Method + " " + req.URL.Path + status := "success" + if err != nil || (resp != nil && resp.StatusCode >= 400) { + status = "error" + VMMMetrics.APIErrorsTotal.Add(req.Context(), 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + VMMMetrics.APIDuration.Record(req.Context(), time.Since(start).Seconds(), + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) + } + + return resp, err +} + // NewVMM creates a Cloud Hypervisor client for an existing VMM socket func NewVMM(socketPath string) (*VMM, error) { - httpClient := &http.Client{ - Transport: &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) - }, + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) }, - Timeout: 30 * time.Second, + } + + httpClient := &http.Client{ + Transport: &metricsRoundTripper{base: transport}, + Timeout: 30 * time.Second, } client, err := NewClientWithResponses("http://localhost/api/v1", @@ -71,15 +103,15 @@ func 
StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // Build command arguments args := []string{"--api-socket", socketPath} args = append(args, extraArgs...) - + // Use Command (not CommandContext) so process survives parent context cancellation cmd := exec.Command(binaryPath, args...) - + // Daemonize: detach from parent process group cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, // Create new process group } - + // Redirect stdout/stderr to log files (process won't block on I/O) instanceDir := filepath.Dir(socketPath) stdoutFile, err := os.OpenFile( @@ -94,7 +126,7 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // The child process receives duplicated file descriptors during fork/exec, // so it can continue writing to the log files even after we close them here. defer stdoutFile.Close() - + stderrFile, err := os.OpenFile( filepath.Join(instanceDir, "ch-stderr.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, @@ -104,24 +136,24 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion return 0, fmt.Errorf("create stderr log: %w", err) } defer stderrFile.Close() - + cmd.Stdout = stdoutFile cmd.Stderr = stderrFile if err := cmd.Start(); err != nil { return 0, fmt.Errorf("start cloud-hypervisor: %w", err) } - + pid := cmd.Process.Pid // Wait for socket to be ready (use fresh context with timeout, not parent context) waitCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() - + if err := waitForSocket(waitCtx, socketPath, 5*time.Second); err != nil { return 0, err } - + return pid, nil } diff --git a/lib/vmm/metrics.go b/lib/vmm/metrics.go index 39ef4dd1..19b514cf 100644 --- a/lib/vmm/metrics.go +++ b/lib/vmm/metrics.go @@ -14,6 +14,15 @@ type Metrics struct { APIErrorsTotal metric.Int64Counter } +// VMMMetrics is the global metrics instance for the vmm package. +// Set this via SetMetrics() during application initialization. 
+var VMMMetrics *Metrics + +// SetMetrics sets the global metrics instance. +func SetMetrics(m *Metrics) { + VMMMetrics = m +} + // NewMetrics creates VMM metrics instruments. // If meter is nil, returns nil (metrics disabled). func NewMetrics(meter metric.Meter) (*Metrics, error) { From 774c9055a2aa4b54313e3d08a2612da9f8c11767 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:37:48 -0500 Subject: [PATCH 09/14] Fix queries --- dashboards/hypeman.json | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index d3b778e6..652742fd 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -152,24 +152,24 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 create", + "expr": "hypeman_instances_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg create", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_restore_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 restore", + "expr": "hypeman_instances_restore_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_restore_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg restore", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - 
"expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_standby_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 standby", + "expr": "hypeman_instances_standby_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_standby_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg standby", "refId": "C" } ], - "title": "Instance Operation Latency (p95)", + "title": "Instance Operation Latency (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, @@ -266,12 +266,12 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_vmm_api_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le, operation))", - "legendFormat": "p95 {{operation}}", + "expr": "hypeman_vmm_api_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_vmm_api_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg {{operation}}", "refId": "A" } ], - "title": "VMM API Latency (p95)", + "title": "VMM API Latency (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, @@ -312,18 +312,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_images_build_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 build", + "expr": "hypeman_images_build_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_images_build_duration_seconds_count{deployment_environment_name=~\"$env\", 
service_instance_id=~\"$instance\"}", + "legendFormat": "avg build", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_volumes_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 volume create", + "expr": "hypeman_volumes_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_volumes_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg volume create", "refId": "B" } ], - "title": "Image Build & Volume Create (p95)", + "title": "Image Build & Volume Create (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, From b902b731feb39627d35fa554b836729754ce333c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:40:39 -0500 Subject: [PATCH 10/14] Fix exec queries --- dashboards/hypeman.json | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index 652742fd..c2c46c55 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -214,21 +214,21 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "sent (stdin)", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", 
"legendFormat": "received (stdout+stderr)", "refId": "B" } ], - "title": "Exec Throughput", + "title": "Exec Bytes (cumulative)", "type": "timeseries", "fieldConfig": { - "defaults": { "unit": "Bps" }, + "defaults": { "unit": "bytes" }, "overrides": [] } }, @@ -248,8 +248,8 @@ }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "pulls/sec ({{status}})", + "expr": "hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "pulls ({{status}})", "refId": "B" } ], @@ -294,8 +294,8 @@ }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "TAP ops/sec ({{operation}})", + "expr": "hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "TAP ops ({{operation}})", "refId": "B" } ], @@ -340,12 +340,12 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "{{from}} → {{to}}", "refId": "A" } ], - "title": "Instance State Transitions/sec", + "title": "Instance State Transitions", "type": "timeseries" }, { @@ -404,12 +404,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "sessions/sec ({{status}})", + "expr": 
"hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "sessions ({{status}}, exit={{exit_code}})", "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_exec_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg duration (s)", + "refId": "B" } ], - "title": "Exec Sessions/sec", + "title": "Exec Sessions & Duration", "type": "timeseries" } ], From c9613d11d438e64d84c086b00436e9f178bc1ef0 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:47:40 -0500 Subject: [PATCH 11/14] Add otel export of logs --- README.md | 2 +- cmd/api/main.go | 5 +++ go.mod | 4 +++ go.sum | 8 +++++ lib/logger/logger.go | 74 +++++++++++++++++++++++++++++++++++--- lib/otel/README.md | 14 ++++++-- lib/otel/otel.go | 49 +++++++++++++++++++++++++ lib/providers/providers.go | 4 ++- 8 files changed, 152 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ac2a048f..5624ea6c 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ echo "ENV=yourname" >> .env make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. +Open http://localhost:3000 to view traces (Tempo), metrics (Mimir), and logs (Loki) in Grafana. **Import the Hypeman dashboard:** 1. 
Go to Dashboards → New → Import diff --git a/cmd/api/main.go b/cmd/api/main.go index a50d4405..59413ddf 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -81,6 +81,11 @@ func run() error { } } + // Set global OTel log handler for logger package + if otelProvider != nil && otelProvider.LogHandler != nil { + otel.SetGlobalLogHandler(otelProvider.LogHandler) + } + // Initialize app with wire app, cleanup, err := initializeApp() if err != nil { diff --git a/go.mod b/go.mod index 7c894df1..210a679a 100644 --- a/go.mod +++ b/go.mod @@ -88,7 +88,11 @@ require ( github.com/vishvananda/netns v0.0.5 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/log v0.14.0 // indirect + go.opentelemetry.io/otel/sdk/log v0.14.0 // indirect go.opentelemetry.io/proto/otlp v1.7.1 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect diff --git a/go.sum b/go.sum index 91d5185a..e1226b84 100644 --- a/go.sum +++ b/go.sum @@ -198,20 +198,28 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 h1:bwnLpizECbPr1RrQ27waeY2SPIPeccCx/xLuoYADZ9s= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0/go.mod h1:3nWlOiiqA9UtUnrcNk82mYasNxD8ehOspL0gOfEo6Y4= go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= 
go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 h1:OMqPldHt79PqWKOMYIAQs3CxAi7RLgPxwfFSwr4ZxtM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0/go.mod h1:1biG4qiqTxKiUCtoWDPpL3fB3KxVwCiGw81j3nKMuHE= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/log v0.14.0 h1:2rzJ+pOAZ8qmZ3DDHg73NEKzSZkhkGIua9gXtxNGgrM= +go.opentelemetry.io/otel/log v0.14.0/go.mod h1:5jRG92fEAgx0SU/vFPxmJvhIuDU9E1SUnEQrMlJpOno= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/log v0.14.0 h1:JU/U3O7N6fsAXj0+CXz21Czg532dW2V4gG1HE/e8Zrg= +go.opentelemetry.io/otel/sdk/log v0.14.0/go.mod h1:imQvII+0ZylXfKU7/wtOND8Hn4OpT3YUoIgqJVksUkM= 
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= diff --git a/lib/logger/logger.go b/lib/logger/logger.go index 23a69ef3..6d60fcde 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -101,16 +101,29 @@ func NewLogger(cfg Config) *slog.Logger { } // NewSubsystemLogger creates a logger for a specific subsystem with its configured level. -func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { +// If otelHandler is provided, logs will be sent both to stdout and to OTel. +func NewSubsystemLogger(subsystem string, cfg Config, otelHandler slog.Handler) *slog.Logger { level := cfg.LevelFor(subsystem) - handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + jsonHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ Level: level, AddSource: cfg.AddSource, }) - // Wrap with trace context handler if we want trace IDs in logs + + var baseHandler slog.Handler + if otelHandler != nil { + // Use multi-handler to write to both stdout and OTel + baseHandler = &multiHandler{ + handlers: []slog.Handler{jsonHandler, otelHandler}, + } + } else { + baseHandler = jsonHandler + } + + // Wrap with trace context handler for trace IDs in logs wrappedHandler := &traceContextHandler{ - Handler: handler, + Handler: baseHandler, subsystem: subsystem, + level: level, } return slog.New(wrappedHandler) } @@ -119,6 +132,12 @@ func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { type traceContextHandler struct { slog.Handler subsystem string + level slog.Level +} + +// Enabled reports whether the handler handles records at the given level. +func (h *traceContextHandler) Enabled(ctx context.Context, level slog.Level) bool { + return level >= h.level } // Handle adds trace_id and span_id from the context if available. 
@@ -143,6 +162,7 @@ func (h *traceContextHandler) WithAttrs(attrs []slog.Attr) slog.Handler { return &traceContextHandler{ Handler: h.Handler.WithAttrs(attrs), subsystem: h.subsystem, + level: h.level, } } @@ -151,7 +171,53 @@ func (h *traceContextHandler) WithGroup(name string) slog.Handler { return &traceContextHandler{ Handler: h.Handler.WithGroup(name), subsystem: h.subsystem, + level: h.level, + } +} + +// multiHandler fans out log records to multiple handlers. +type multiHandler struct { + handlers []slog.Handler +} + +// Enabled returns true if any handler is enabled. +func (m *multiHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, h := range m.handlers { + if h.Enabled(ctx, level) { + return true + } + } + return false +} + +// Handle writes the record to all handlers. +func (m *multiHandler) Handle(ctx context.Context, r slog.Record) error { + for _, h := range m.handlers { + if h.Enabled(ctx, r.Level) { + if err := h.Handle(ctx, r); err != nil { + return err + } + } + } + return nil +} + +// WithAttrs returns a new multiHandler with the given attributes. +func (m *multiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithAttrs(attrs) + } + return &multiHandler{handlers: handlers} +} + +// WithGroup returns a new multiHandler with the given group name. +func (m *multiHandler) WithGroup(name string) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithGroup(name) } + return &multiHandler{handlers: handlers} } // AddToContext adds a logger to the context. diff --git a/lib/otel/README.md b/lib/otel/README.md index ff181ef3..6db7a686 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -4,9 +4,10 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. 
## Features -- OTLP export for traces and metrics (gRPC) +- OTLP export for traces, metrics, and logs (gRPC) - Runtime metrics (Go GC, goroutines, memory) - Application-specific metrics per subsystem +- Log bridging from slog to OTel (viewable in Grafana/Loki) - Graceful degradation (failures don't crash the app) ## Configuration @@ -89,6 +90,15 @@ provider, shutdown, err := otel.Init(ctx, otel.Config{ }) defer shutdown(ctx) -meter := provider.Meter // Use for creating metrics +meter := provider.Meter // Use for creating metrics +tracer := provider.Tracer // Use for creating traces +logHandler := provider.LogHandler // Use with slog for logs to OTel ``` +## Logs + +Logs are exported via the OTel log bridge (`otelslog`). When OTel is enabled, all slog logs are sent to Loki (via OTLP) and include: +- `subsystem` attribute (API, IMAGES, INSTANCES, etc.) +- `trace_id` and `span_id` when available +- Service attributes (name, instance, environment) + diff --git a/lib/otel/otel.go b/lib/otel/otel.go index acac499a..2b1c2e41 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -4,15 +4,19 @@ package otel import ( "context" "fmt" + "log/slog" goruntime "runtime" "time" + "go.opentelemetry.io/contrib/bridges/otelslog" otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" + sdklog "go.opentelemetry.io/otel/sdk/log" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" @@ -35,8 +39,10 @@ type Config struct { type Provider struct { TracerProvider *sdktrace.TracerProvider MeterProvider *sdkmetric.MeterProvider + LoggerProvider *sdklog.LoggerProvider Tracer trace.Tracer Meter metric.Meter + 
LogHandler slog.Handler startTime time.Time } @@ -105,6 +111,26 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err sdkmetric.WithResource(res), ) + // Create log exporter + logOpts := []otlploggrpc.Option{ + otlploggrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + logOpts = append(logOpts, otlploggrpc.WithInsecure()) + } + logExporter, err := otlploggrpc.New(ctx, logOpts...) + if err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create log exporter: %w", err) + } + + // Create logger provider + loggerProvider := sdklog.NewLoggerProvider( + sdklog.WithProcessor(sdklog.NewBatchProcessor(logExporter)), + sdklog.WithResource(res), + ) + // Set global providers otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) @@ -117,14 +143,20 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err if err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { tracerProvider.Shutdown(ctx) meterProvider.Shutdown(ctx) + loggerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("start runtime metrics: %w", err) } + // Create slog handler that bridges to OTel + logHandler := otelslog.NewHandler(cfg.ServiceName, otelslog.WithLoggerProvider(loggerProvider)) + provider := &Provider{ TracerProvider: tracerProvider, MeterProvider: meterProvider, + LoggerProvider: loggerProvider, Tracer: tracerProvider.Tracer(cfg.ServiceName), Meter: meterProvider.Meter(cfg.ServiceName), + LogHandler: logHandler, startTime: time.Now(), } @@ -132,6 +164,7 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err if err := provider.registerSystemMetrics(cfg); err != nil { tracerProvider.Shutdown(ctx) meterProvider.Shutdown(ctx) + loggerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("register system metrics: %w", err) } @@ -143,6 +176,9 @@ func Init(ctx context.Context, cfg Config) (*Provider, 
func(context.Context) err if err := meterProvider.Shutdown(ctx); err != nil { errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) } + if err := loggerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown logger: %w", err)) + } if len(errs) > 0 { return fmt.Errorf("shutdown errors: %v", errs) } @@ -214,3 +250,16 @@ func (p *Provider) MeterFor(subsystem string) metric.Meter { func GoVersion() string { return goruntime.Version() } + +// globalLogHandler holds the OTel log handler for use by the logger package. +var globalLogHandler slog.Handler + +// SetGlobalLogHandler sets the global OTel log handler. +func SetGlobalLogHandler(h slog.Handler) { + globalLogHandler = h +} + +// GetGlobalLogHandler returns the global OTel log handler. +func GetGlobalLogHandler() slog.Handler { + return globalLogHandler +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 93713d13..010ee3e6 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -11,6 +11,7 @@ import ( "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" + hypemanotel "github.com/onkernel/hypeman/lib/otel" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" @@ -20,7 +21,8 @@ import ( // ProvideLogger provides a structured logger with subsystem-specific levels func ProvideLogger() *slog.Logger { cfg := logger.NewConfig() - return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg) + otelHandler := hypemanotel.GetGlobalLogHandler() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) } // ProvideContext provides a context with logger attached From 0307c56281f52451d43aafee4f4ce1ea62691bf1 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:06:55 -0500 Subject: [PATCH 12/14] associate logs and traces --- cmd/api/api/images.go | 8 ++-- cmd/api/api/instances.go | 24 
++++++------ cmd/api/api/volumes.go | 16 ++++---- cmd/api/main.go | 19 ++++++++-- lib/middleware/otel.go | 79 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 28 deletions(-) diff --git a/cmd/api/api/images.go b/cmd/api/api/images.go index 913b9918..68f4cc96 100644 --- a/cmd/api/api/images.go +++ b/cmd/api/api/images.go @@ -14,7 +14,7 @@ func (s *ApiService) ListImages(ctx context.Context, request oapi.ListImagesRequ domainImages, err := s.ImageManager.ListImages(ctx) if err != nil { - log.Error("failed to list images", "error", err) + log.ErrorContext(ctx, "failed to list images", "error", err) return oapi.ListImages500JSONResponse{ Code: "internal_error", Message: "failed to list images", @@ -50,7 +50,7 @@ func (s *ApiService) CreateImage(ctx context.Context, request oapi.CreateImageRe Message: "image not found", }, nil default: - log.Error("failed to create image", "error", err, "name", request.Body.Name) + log.ErrorContext(ctx, "failed to create image", "error", err, "name", request.Body.Name) return oapi.CreateImage500JSONResponse{ Code: "internal_error", Message: "failed to create image", @@ -72,7 +72,7 @@ func (s *ApiService) GetImage(ctx context.Context, request oapi.GetImageRequestO Message: "image not found", }, nil default: - log.Error("failed to get image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to get image", "error", err, "name", request.Name) return oapi.GetImage500JSONResponse{ Code: "internal_error", Message: "failed to get image", @@ -94,7 +94,7 @@ func (s *ApiService) DeleteImage(ctx context.Context, request oapi.DeleteImageRe Message: "image not found", }, nil default: - log.Error("failed to delete image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to delete image", "error", err, "name", request.Name) return oapi.DeleteImage500JSONResponse{ Code: "internal_error", Message: "failed to delete image", diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go 
index 2d84ac08..12f5bd7b 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -21,18 +21,18 @@ func (s *ApiService) ListInstances(ctx context.Context, request oapi.ListInstanc domainInsts, err := s.InstanceManager.ListInstances(ctx) if err != nil { - log.Error("failed to list instances", "error", err) + log.ErrorContext(ctx, "failed to list instances", "error", err) return oapi.ListInstances500JSONResponse{ Code: "internal_error", Message: "failed to list instances", }, nil } - + oapiInsts := make([]oapi.Instance, len(domainInsts)) for i, inst := range domainInsts { oapiInsts[i] = instanceToOAPI(inst) } - + return oapi.ListInstances200JSONResponse(oapiInsts), nil } @@ -160,7 +160,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Message: err.Error(), }, nil default: - log.Error("failed to create instance", "error", err, "image", request.Body.Image) + log.ErrorContext(ctx, "failed to create instance", "error", err, "image", request.Body.Image) return oapi.CreateInstance500JSONResponse{ Code: "internal_error", Message: "failed to create instance", @@ -195,7 +195,7 @@ func (s *ApiService) GetInstance(ctx context.Context, request oapi.GetInstanceRe Message: "multiple instances have this name, use instance ID instead", }, nil default: - log.Error("failed to get instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get instance", "error", err, "id", request.Id) return oapi.GetInstance500JSONResponse{ Code: "internal_error", Message: "failed to get instance", @@ -235,7 +235,7 @@ func (s *ApiService) DeleteInstance(ctx context.Context, request oapi.DeleteInst Message: "instance not found", }, nil default: - log.Error("failed to delete instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to delete instance", "error", err, "id", request.Id) return oapi.DeleteInstance500JSONResponse{ Code: "internal_error", Message: "failed to delete instance", @@ -280,7 +280,7 @@ func (s 
*ApiService) StandbyInstance(ctx context.Context, request oapi.StandbyIn Message: err.Error(), }, nil default: - log.Error("failed to standby instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to standby instance", "error", err, "id", request.Id) return oapi.StandbyInstance500JSONResponse{ Code: "internal_error", Message: "failed to standby instance", @@ -325,7 +325,7 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn Message: err.Error(), }, nil default: - log.Error("failed to restore instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to restore instance", "error", err, "id", request.Id) return oapi.RestoreInstance500JSONResponse{ Code: "internal_error", Message: "failed to restore instance", @@ -417,18 +417,18 @@ func (s *ApiService) GetInstanceLogs(ctx context.Context, request oapi.GetInstan // AttachVolume attaches a volume to an instance (not yet implemented) func (s *ApiService) AttachVolume(ctx context.Context, request oapi.AttachVolumeRequestObject) (oapi.AttachVolumeResponseObject, error) { - return oapi.AttachVolume500JSONResponse{ + return oapi.AttachVolume500JSONResponse{ Code: "not_implemented", Message: "volume attachment not yet implemented", - }, nil + }, nil } // DetachVolume detaches a volume from an instance (not yet implemented) func (s *ApiService) DetachVolume(ctx context.Context, request oapi.DetachVolumeRequestObject) (oapi.DetachVolumeResponseObject, error) { - return oapi.DetachVolume500JSONResponse{ + return oapi.DetachVolume500JSONResponse{ Code: "not_implemented", Message: "volume detachment not yet implemented", - }, nil + }, nil } // instanceToOAPI converts domain Instance to OAPI Instance diff --git a/cmd/api/api/volumes.go b/cmd/api/api/volumes.go index 8b4afd88..6e903801 100644 --- a/cmd/api/api/volumes.go +++ b/cmd/api/api/volumes.go @@ -18,18 +18,18 @@ func (s *ApiService) ListVolumes(ctx context.Context, request oapi.ListVolumesRe domainVols, err := 
s.VolumeManager.ListVolumes(ctx) if err != nil { - log.Error("failed to list volumes", "error", err) + log.ErrorContext(ctx, "failed to list volumes", "error", err) return oapi.ListVolumes500JSONResponse{ Code: "internal_error", Message: "failed to list volumes", }, nil } - + oapiVols := make([]oapi.Volume, len(domainVols)) for i, vol := range domainVols { oapiVols[i] = volumeToOAPI(vol) } - + return oapi.ListVolumes200JSONResponse(oapiVols), nil } @@ -56,7 +56,7 @@ func (s *ApiService) CreateVolume(ctx context.Context, request oapi.CreateVolume Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume", "error", err, "name", request.JSONBody.Name) + log.ErrorContext(ctx, "failed to create volume", "error", err, "name", request.JSONBody.Name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -171,7 +171,7 @@ func (s *ApiService) createVolumeFromMultipart(ctx context.Context, multipartRea Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume from archive", "error", err, "name", name) + log.ErrorContext(ctx, "failed to create volume from archive", "error", err, "name", name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -222,7 +222,7 @@ func (s *ApiService) GetVolume(ctx context.Context, request oapi.GetVolumeReques Message: "multiple volumes have this name, use volume ID instead", }, nil default: - log.Error("failed to get volume", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get volume", "error", err, "id", request.Id) return oapi.GetVolume500JSONResponse{ Code: "internal_error", Message: "failed to get volume", @@ -269,7 +269,7 @@ func (s *ApiService) DeleteVolume(ctx context.Context, request oapi.DeleteVolume Message: "volume is in use by an instance", }, nil default: - log.Error("failed to delete volume", "error", err, "id", request.Id) + 
log.ErrorContext(ctx, "failed to delete volume", "error", err, "id", request.Id) return oapi.DeleteVolume500JSONResponse{ Code: "internal_error", Message: "failed to delete volume", @@ -302,5 +302,3 @@ func volumeToOAPI(vol volumes.Volume) oapi.Volume { return oapiVol } - - diff --git a/cmd/api/main.go b/cmd/api/main.go index 59413ddf..3951d7b7 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -151,6 +151,13 @@ func run() error { } } + // Create access logger with OTel handler for HTTP request logging with trace correlation + var accessLogHandler slog.Handler + if otelProvider != nil { + accessLogHandler = otelProvider.LogHandler + } + accessLogger := mw.NewAccessLogger(accessLogHandler) + // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() if err != nil { @@ -162,11 +169,12 @@ func run() error { spec.Servers = nil // Custom exec endpoint (outside OpenAPI spec, uses WebSocket) + // Note: No otelchi here as WebSocket doesn't work well with tracing middleware r.With( middleware.RequestID, middleware.RealIP, - middleware.Logger, middleware.Recoverer, + mw.AccessLogger(accessLogger), mw.JwtAuth(app.Config.JwtSecret), ).Get("/instances/{id}/exec", app.ApiService.ExecHandler) @@ -175,13 +183,18 @@ func run() error { // Common middleware r.Use(middleware.RequestID) r.Use(middleware.RealIP) - r.Use(middleware.Logger) r.Use(middleware.Recoverer) - // OpenTelemetry middleware (inside group to avoid breaking WebSocket/SSE on other routes) + // OpenTelemetry tracing middleware FIRST (creates span context) if cfg.OtelEnabled { r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) } + + // Inject logger into request context for handlers to use + r.Use(mw.InjectLogger(accessLogger)) + + // Access logger AFTER otelchi so trace context is available + r.Use(mw.AccessLogger(accessLogger)) if httpMetricsMw != nil { // Skip HTTP metrics for SSE streaming endpoints (logs) r.Use(func(next http.Handler) http.Handler { diff --git 
a/lib/middleware/otel.go b/lib/middleware/otel.go index 823aef08..c77be233 100644 --- a/lib/middleware/otel.go +++ b/lib/middleware/otel.go @@ -1,10 +1,13 @@ package middleware import ( + "fmt" + "log/slog" "net/http" "time" "github.com/go-chi/chi/v5" + "github.com/onkernel/hypeman/lib/logger" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -97,3 +100,79 @@ func NoopHTTPMetrics() func(http.Handler) http.Handler { return next } } + +// AccessLogger returns a middleware that logs HTTP requests using slog with trace context. +// This replaces chi's middleware.Logger to get logs into OTel/Loki with trace correlation. +func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code and bytes + wrapped := &accessLogWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } + + // Process request + next.ServeHTTP(wrapped, r) + + // Get route pattern + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Log with trace context from request context + duration := time.Since(start) + log.InfoContext(r.Context(), + fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.statusCode, wrapped.bytesWritten, duration.Milliseconds()), + "method", r.Method, + "path", routePattern, + "status", wrapped.statusCode, + "bytes", wrapped.bytesWritten, + "duration_ms", duration.Milliseconds(), + "remote_addr", r.RemoteAddr, + ) + }) + } +} + +// accessLogWriter wraps http.ResponseWriter to capture status and bytes. 
+type accessLogWriter struct { + http.ResponseWriter + statusCode int + bytesWritten int +} + +func (w *accessLogWriter) WriteHeader(code int) { + w.statusCode = code + w.ResponseWriter.WriteHeader(code) +} + +func (w *accessLogWriter) Write(b []byte) (int, error) { + n, err := w.ResponseWriter.Write(b) + w.bytesWritten += n + return n, err +} + +func (w *accessLogWriter) Unwrap() http.ResponseWriter { + return w.ResponseWriter +} + +// NewAccessLogger creates an access logger with OTel handler if available. +func NewAccessLogger(otelHandler slog.Handler) *slog.Logger { + cfg := logger.NewConfig() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) +} + +// InjectLogger returns middleware that adds the logger to the request context. +// This enables handlers to use logger.FromContext(ctx) with trace correlation. +func InjectLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := logger.AddToContext(r.Context(), log) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } +} From d8d45e0296103a950292ed1ca61915a7272a134f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:12:56 -0500 Subject: [PATCH 13/14] Use logging context where we can --- lib/exec/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/exec/client.go b/lib/exec/client.go index 4ca9fd40..a32e6338 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -203,7 +203,7 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti // dialVsock connects to Cloud Hypervisor's vsock Unix socket and performs the handshake func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { - slog.Debug("connecting to vsock", "socket", vsockSocketPath) + slog.DebugContext(ctx, "connecting to vsock", "socket", vsockSocketPath) // Use dial timeout, respecting context deadline if 
shorter dialTimeout := vsockDialTimeout @@ -220,7 +220,7 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("dial vsock socket %s: %w", vsockSocketPath, err) } - slog.Debug("connected to vsock socket, performing handshake", "port", vsockGuestPort) + slog.DebugContext(ctx, "connected to vsock socket, performing handshake", "port", vsockGuestPort) // Set deadline for handshake if err := conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { @@ -255,7 +255,7 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("vsock handshake failed: %s", response) } - slog.Debug("vsock handshake successful", "response", response) + slog.DebugContext(ctx, "vsock handshake successful", "response", response) // Return wrapped connection that uses the bufio.Reader // This ensures any bytes buffered during handshake are not lost From a02971115a2c6268784af04605a7aa45b57c5bed Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:31:49 -0500 Subject: [PATCH 14/14] Simplify --- lib/images/manager.go | 95 ------------ lib/images/metrics.go | 103 +++++++++++++ lib/instances/manager.go | 110 -------------- lib/instances/metrics.go | 118 +++++++++++++++ lib/middleware/otel.go | 126 +++++++++------- lib/network/manager.go | 54 ------- lib/network/metrics.go | 61 ++++++++ lib/otel/metrics.go | 317 --------------------------------------- lib/volumes/manager.go | 96 ------------ lib/volumes/metrics.go | 105 +++++++++++++ 10 files changed, 462 insertions(+), 723 deletions(-) create mode 100644 lib/images/metrics.go create mode 100644 lib/instances/metrics.go create mode 100644 lib/network/metrics.go delete mode 100644 lib/otel/metrics.go create mode 100644 lib/volumes/metrics.go diff --git a/lib/images/manager.go b/lib/images/manager.go index d167c5bd..b637bce5 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -10,7 +10,6 @@ import ( "time" 
"github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -30,12 +29,6 @@ type Manager interface { RecoverInterruptedBuilds() } -// Metrics holds the metrics instruments for image operations. -type Metrics struct { - buildDuration metric.Float64Histogram - pullsTotal metric.Int64Counter -} - type manager struct { paths *paths.Paths ociClient *ociClient @@ -73,75 +66,6 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Ma return m, nil } -// newMetrics creates and registers all image metrics. -func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - buildDuration, err := meter.Float64Histogram( - "hypeman_images_build_duration_seconds", - metric.WithDescription("Time to build an image"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - pullsTotal, err := meter.Int64Counter( - "hypeman_images_pulls_total", - metric.WithDescription("Total number of image pulls from registries"), - ) - if err != nil { - return nil, err - } - - // Register observable gauges for queue length and total images - buildQueueLength, err := meter.Int64ObservableGauge( - "hypeman_images_build_queue_length", - metric.WithDescription("Current number of images in the build queue"), - ) - if err != nil { - return nil, err - } - - imagesTotal, err := meter.Int64ObservableGauge( - "hypeman_images_total", - metric.WithDescription("Total number of cached images"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - // Report queue length - o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) - - // Count images by status - metas, err := listAllTags(m.paths) - if err != nil { - return nil - } - statusCounts := make(map[string]int64) - for _, meta := range metas { - statusCounts[meta.Status]++ - } - for status, count := range statusCounts { - o.ObserveInt64(imagesTotal, count, - 
metric.WithAttributes(attribute.String("status", status))) - } - return nil - }, - buildQueueLength, - imagesTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - buildDuration: buildDuration, - pullsTotal: pullsTotal, - }, nil -} - func (m *manager) ListImages(ctx context.Context) ([]Image, error) { metas, err := listAllTags(m.paths) if err != nil { @@ -307,25 +231,6 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { m.recordBuildMetrics(ctx, buildStart, "success") } -// recordBuildMetrics records the build duration metric. -func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - m.metrics.buildDuration.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - -// recordPullMetrics records the pull counter metric. -func (m *manager) recordPullMetrics(ctx context.Context, status string) { - if m.metrics == nil { - return - } - m.metrics.pullsTotal.Add(ctx, 1, - metric.WithAttributes(attribute.String("status", status))) -} - func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err error) { meta, readErr := readMetadata(m.paths, ref.Repository(), ref.DigestHex()) if readErr != nil { diff --git a/lib/images/metrics.go b/lib/images/metrics.go new file mode 100644 index 00000000..08334ef2 --- /dev/null +++ b/lib/images/metrics.go @@ -0,0 +1,103 @@ +package images + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for image operations. +type Metrics struct { + buildDuration metric.Float64Histogram + pullsTotal metric.Int64Counter +} + +// newMetrics creates and registers all image metrics. 
+func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges for queue length and total images + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + // Report queue length + o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) + + // Count images by status + metas, err := listAllTags(m.paths) + if err != nil { + return nil + } + statusCounts := make(map[string]int64) + for _, meta := range metas { + statusCounts[meta.Status]++ + } + for status, count := range statusCounts { + o.ObserveInt64(imagesTotal, count, + metric.WithAttributes(attribute.String("status", status))) + } + return nil + }, + buildQueueLength, + imagesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + buildDuration: buildDuration, + pullsTotal: pullsTotal, + }, nil +} + +// recordBuildMetrics records the build duration metric. 
+func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.buildDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordPullMetrics records the pull counter metric. +func (m *manager) recordPullMetrics(ctx context.Context, status string) { + if m.metrics == nil { + return + } + m.metrics.pullsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("status", status))) +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 68e9b36c..a7050578 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,14 +4,12 @@ import ( "context" "fmt" "sync" - "time" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" ) @@ -39,15 +37,6 @@ type ResourceLimits struct { MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) } -// Metrics holds the metrics instruments for instance operations. -type Metrics struct { - createDuration metric.Float64Histogram - restoreDuration metric.Float64Histogram - standbyDuration metric.Float64Histogram - stateTransitions metric.Int64Counter - tracer trace.Tracer -} - type manager struct { paths *paths.Paths imageManager images.Manager @@ -85,105 +74,6 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste return m } -// newInstanceMetrics creates and registers all instance metrics. 
-func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { - createDuration, err := meter.Float64Histogram( - "hypeman_instances_create_duration_seconds", - metric.WithDescription("Time to create an instance"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - restoreDuration, err := meter.Float64Histogram( - "hypeman_instances_restore_duration_seconds", - metric.WithDescription("Time to restore an instance from standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - standbyDuration, err := meter.Float64Histogram( - "hypeman_instances_standby_duration_seconds", - metric.WithDescription("Time to put an instance in standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - stateTransitions, err := meter.Int64Counter( - "hypeman_instances_state_transitions_total", - metric.WithDescription("Total number of instance state transitions"), - ) - if err != nil { - return nil, err - } - - // Register observable gauge for instance counts by state - instancesTotal, err := meter.Int64ObservableGauge( - "hypeman_instances_total", - metric.WithDescription("Total number of instances by state"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - instances, err := m.listInstances(ctx) - if err != nil { - return nil - } - stateCounts := make(map[string]int64) - for _, inst := range instances { - stateCounts[string(inst.State)]++ - } - for state, count := range stateCounts { - o.ObserveInt64(instancesTotal, count, - metric.WithAttributes(attribute.String("state", state))) - } - return nil - }, - instancesTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - createDuration: createDuration, - restoreDuration: restoreDuration, - standbyDuration: standbyDuration, - stateTransitions: stateTransitions, - tracer: tracer, - }, nil -} - -// recordDuration records operation 
duration. -func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - histogram.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - -// recordStateTransition records a state transition. -func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { - if m.metrics == nil { - return - } - m.metrics.stateTransitions.Add(ctx, 1, - metric.WithAttributes( - attribute.String("from", fromState), - attribute.String("to", toState), - )) -} - // getInstanceLock returns or creates a lock for a specific instance func (m *manager) getInstanceLock(id string) *sync.RWMutex { lock, _ := m.instanceLocks.LoadOrStore(id, &sync.RWMutex{}) diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go new file mode 100644 index 00000000..3500aa1e --- /dev/null +++ b/lib/instances/metrics.go @@ -0,0 +1,118 @@ +package instances + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Metrics holds the metrics instruments for instance operations. +type Metrics struct { + createDuration metric.Float64Histogram + restoreDuration metric.Float64Histogram + standbyDuration metric.Float64Histogram + stateTransitions metric.Int64Counter + tracer trace.Tracer +} + +// newInstanceMetrics creates and registers all instance metrics. 
+func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for instance counts by state + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + instances, err := m.listInstances(ctx) + if err != nil { + return nil + } + stateCounts := make(map[string]int64) + for _, inst := range instances { + stateCounts[string(inst.State)]++ + } + for state, count := range stateCounts { + o.ObserveInt64(instancesTotal, count, + metric.WithAttributes(attribute.String("state", state))) + } + return nil + }, + instancesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + restoreDuration: restoreDuration, + standbyDuration: standbyDuration, + stateTransitions: stateTransitions, + tracer: tracer, + }, nil +} + +// recordDuration records operation 
duration. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + histogram.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordStateTransition records a state transition. +func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { + if m.metrics == nil { + return + } + m.metrics.stateTransitions.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", fromState), + attribute.String("to", toState), + )) +} diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go index c77be233..cac161c9 100644 --- a/lib/middleware/otel.go +++ b/lib/middleware/otel.go @@ -1,8 +1,10 @@ package middleware import ( + "bufio" "fmt" "log/slog" + "net" "net/http" "time" @@ -48,11 +50,8 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() - // Wrap response writer to capture status code - wrapped := &responseWriter{ - ResponseWriter: w, - statusCode: http.StatusOK, - } + // Wrap response writer to capture status code and bytes + wrapped := wrapResponseWriter(w) // Process request next.ServeHTTP(wrapped, r) @@ -70,7 +69,7 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { attrs := []attribute.KeyValue{ attribute.String("method", r.Method), attribute.String("path", routePattern), - attribute.Int("status", wrapped.statusCode), + attribute.Int("status", wrapped.Status()), } m.requestsTotal.Add(r.Context(), 1, metric.WithAttributes(attrs...)) @@ -78,22 +77,6 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { }) } -// responseWriter wraps http.ResponseWriter to capture the status code. 
-type responseWriter struct { - http.ResponseWriter - statusCode int -} - -func (rw *responseWriter) WriteHeader(code int) { - rw.statusCode = code - rw.ResponseWriter.WriteHeader(code) -} - -// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. -func (rw *responseWriter) Unwrap() http.ResponseWriter { - return rw.ResponseWriter -} - // NoopHTTPMetrics returns a middleware that does nothing (for when OTel is disabled). func NoopHTTPMetrics() func(http.Handler) http.Handler { return func(next http.Handler) http.Handler { @@ -109,10 +92,7 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { start := time.Now() // Wrap response writer to capture status code and bytes - wrapped := &accessLogWriter{ - ResponseWriter: w, - statusCode: http.StatusOK, - } + wrapped := wrapResponseWriter(w) // Process request next.ServeHTTP(wrapped, r) @@ -126,11 +106,11 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { // Log with trace context from request context duration := time.Since(start) log.InfoContext(r.Context(), - fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.statusCode, wrapped.bytesWritten, duration.Milliseconds()), + fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.Status(), wrapped.BytesWritten(), duration.Milliseconds()), "method", r.Method, "path", routePattern, - "status", wrapped.statusCode, - "bytes", wrapped.bytesWritten, + "status", wrapped.Status(), + "bytes", wrapped.BytesWritten(), "duration_ms", duration.Milliseconds(), "remote_addr", r.RemoteAddr, ) @@ -138,28 +118,6 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { } } -// accessLogWriter wraps http.ResponseWriter to capture status and bytes. 
-type accessLogWriter struct { - http.ResponseWriter - statusCode int - bytesWritten int -} - -func (w *accessLogWriter) WriteHeader(code int) { - w.statusCode = code - w.ResponseWriter.WriteHeader(code) -} - -func (w *accessLogWriter) Write(b []byte) (int, error) { - n, err := w.ResponseWriter.Write(b) - w.bytesWritten += n - return n, err -} - -func (w *accessLogWriter) Unwrap() http.ResponseWriter { - return w.ResponseWriter -} - // NewAccessLogger creates an access logger with OTel handler if available. func NewAccessLogger(otelHandler slog.Handler) *slog.Logger { cfg := logger.NewConfig() @@ -176,3 +134,69 @@ func InjectLogger(log *slog.Logger) func(http.Handler) http.Handler { }) } } + +// responseWriter wraps http.ResponseWriter to capture status code and bytes written. +// It also implements http.Flusher and http.Hijacker when the underlying writer supports them. +type responseWriter struct { + http.ResponseWriter + statusCode int + bytesWritten int + wroteHeader bool +} + +// wrapResponseWriter creates a new responseWriter wrapper. +func wrapResponseWriter(w http.ResponseWriter) *responseWriter { + return &responseWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } +} + +// Status returns the HTTP status code. +func (rw *responseWriter) Status() int { + return rw.statusCode +} + +// BytesWritten returns the number of bytes written. +func (rw *responseWriter) BytesWritten() int { + return rw.bytesWritten +} + +// WriteHeader captures the status code before calling the underlying WriteHeader. +func (rw *responseWriter) WriteHeader(code int) { + if !rw.wroteHeader { + rw.statusCode = code + rw.wroteHeader = true + rw.ResponseWriter.WriteHeader(code) + } +} + +// Write captures bytes written and calls the underlying Write. 
+func (rw *responseWriter) Write(b []byte) (int, error) { + if !rw.wroteHeader { + rw.WriteHeader(http.StatusOK) + } + n, err := rw.ResponseWriter.Write(b) + rw.bytesWritten += n + return n, err +} + +// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. +func (rw *responseWriter) Unwrap() http.ResponseWriter { + return rw.ResponseWriter +} + +// Flush implements http.Flusher. It delegates to the underlying writer if supported. +func (rw *responseWriter) Flush() { + if f, ok := rw.ResponseWriter.(http.Flusher); ok { + f.Flush() + } +} + +// Hijack implements http.Hijacker. It delegates to the underlying writer if supported. +func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if h, ok := rw.ResponseWriter.(http.Hijacker); ok { + return h.Hijack() + } + return nil, nil, fmt.Errorf("underlying ResponseWriter does not implement http.Hijacker") +} diff --git a/lib/network/manager.go b/lib/network/manager.go index 51e4b13d..a48d3341 100644 --- a/lib/network/manager.go +++ b/lib/network/manager.go @@ -9,7 +9,6 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -29,11 +28,6 @@ type Manager interface { NameExists(ctx context.Context, name string) (bool, error) } -// Metrics holds the metrics instruments for network operations. -type Metrics struct { - tapOperations metric.Int64Counter -} - // manager implements the Manager interface type manager struct { paths *paths.Paths @@ -61,54 +55,6 @@ func NewManager(p *paths.Paths, cfg *config.Config, meter metric.Meter) Manager return m } -// newNetworkMetrics creates and registers all network metrics. 
-func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - tapOperations, err := meter.Int64Counter( - "hypeman_network_tap_operations_total", - metric.WithDescription("Total number of TAP device operations"), - ) - if err != nil { - return nil, err - } - - // Register observable gauge for allocations - allocationsTotal, err := meter.Int64ObservableGauge( - "hypeman_network_allocations_total", - metric.WithDescription("Total number of active network allocations"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - allocs, err := m.ListAllocations(ctx) - if err != nil { - return nil - } - o.ObserveInt64(allocationsTotal, int64(len(allocs))) - return nil - }, - allocationsTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - tapOperations: tapOperations, - }, nil -} - -// recordTAPOperation records a TAP device operation. -func (m *manager) recordTAPOperation(ctx context.Context, operation string) { - if m.metrics == nil { - return - } - m.metrics.tapOperations.Add(ctx, 1, - metric.WithAttributes(attribute.String("operation", operation))) -} - // Initialize initializes the network manager and creates default network. // runningInstanceIDs should contain IDs of instances currently running (have active VMM). func (m *manager) Initialize(ctx context.Context, runningInstanceIDs []string) error { diff --git a/lib/network/metrics.go b/lib/network/metrics.go new file mode 100644 index 00000000..48cbd1c0 --- /dev/null +++ b/lib/network/metrics.go @@ -0,0 +1,61 @@ +package network + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for network operations. +type Metrics struct { + tapOperations metric.Int64Counter +} + +// newNetworkMetrics creates and registers all network metrics. 
+func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for allocations + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + allocs, err := m.ListAllocations(ctx) + if err != nil { + return nil + } + o.ObserveInt64(allocationsTotal, int64(len(allocs))) + return nil + }, + allocationsTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + tapOperations: tapOperations, + }, nil +} + +// recordTAPOperation records a TAP device operation. +func (m *manager) recordTAPOperation(ctx context.Context, operation string) { + if m.metrics == nil { + return + } + m.metrics.tapOperations.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) +} diff --git a/lib/otel/metrics.go b/lib/otel/metrics.go deleted file mode 100644 index 6532703d..00000000 --- a/lib/otel/metrics.go +++ /dev/null @@ -1,317 +0,0 @@ -package otel - -import ( - "go.opentelemetry.io/otel/metric" -) - -// ImageMetrics holds metrics for the image manager. -type ImageMetrics struct { - BuildQueueLength metric.Int64ObservableGauge - BuildDuration metric.Float64Histogram - ImagesTotal metric.Int64ObservableGauge - PullsTotal metric.Int64Counter -} - -// NewImageMetrics creates metrics for the image manager. 
-func NewImageMetrics(meter metric.Meter) (*ImageMetrics, error) { - buildQueueLength, err := meter.Int64ObservableGauge( - "hypeman_images_build_queue_length", - metric.WithDescription("Current number of images in the build queue"), - ) - if err != nil { - return nil, err - } - - buildDuration, err := meter.Float64Histogram( - "hypeman_images_build_duration_seconds", - metric.WithDescription("Time to build an image"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - imagesTotal, err := meter.Int64ObservableGauge( - "hypeman_images_total", - metric.WithDescription("Total number of cached images"), - ) - if err != nil { - return nil, err - } - - pullsTotal, err := meter.Int64Counter( - "hypeman_images_pulls_total", - metric.WithDescription("Total number of image pulls from registries"), - ) - if err != nil { - return nil, err - } - - return &ImageMetrics{ - BuildQueueLength: buildQueueLength, - BuildDuration: buildDuration, - ImagesTotal: imagesTotal, - PullsTotal: pullsTotal, - }, nil -} - -// InstanceMetrics holds metrics for the instance manager. -type InstanceMetrics struct { - InstancesTotal metric.Int64ObservableGauge - CreateDuration metric.Float64Histogram - RestoreDuration metric.Float64Histogram - StandbyDuration metric.Float64Histogram - StateTransitions metric.Int64Counter -} - -// NewInstanceMetrics creates metrics for the instance manager. 
-func NewInstanceMetrics(meter metric.Meter) (*InstanceMetrics, error) { - instancesTotal, err := meter.Int64ObservableGauge( - "hypeman_instances_total", - metric.WithDescription("Total number of instances by state"), - ) - if err != nil { - return nil, err - } - - createDuration, err := meter.Float64Histogram( - "hypeman_instances_create_duration_seconds", - metric.WithDescription("Time to create an instance"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - restoreDuration, err := meter.Float64Histogram( - "hypeman_instances_restore_duration_seconds", - metric.WithDescription("Time to restore an instance from standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - standbyDuration, err := meter.Float64Histogram( - "hypeman_instances_standby_duration_seconds", - metric.WithDescription("Time to put an instance in standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - stateTransitions, err := meter.Int64Counter( - "hypeman_instances_state_transitions_total", - metric.WithDescription("Total number of instance state transitions"), - ) - if err != nil { - return nil, err - } - - return &InstanceMetrics{ - InstancesTotal: instancesTotal, - CreateDuration: createDuration, - RestoreDuration: restoreDuration, - StandbyDuration: standbyDuration, - StateTransitions: stateTransitions, - }, nil -} - -// NetworkMetrics holds metrics for the network manager. -type NetworkMetrics struct { - AllocationsTotal metric.Int64ObservableGauge - TapOperations metric.Int64Counter -} - -// NewNetworkMetrics creates metrics for the network manager. 
-func NewNetworkMetrics(meter metric.Meter) (*NetworkMetrics, error) { - allocationsTotal, err := meter.Int64ObservableGauge( - "hypeman_network_allocations_total", - metric.WithDescription("Total number of active network allocations"), - ) - if err != nil { - return nil, err - } - - tapOperations, err := meter.Int64Counter( - "hypeman_network_tap_operations_total", - metric.WithDescription("Total number of TAP device operations"), - ) - if err != nil { - return nil, err - } - - return &NetworkMetrics{ - AllocationsTotal: allocationsTotal, - TapOperations: tapOperations, - }, nil -} - -// VolumeMetrics holds metrics for the volume manager. -type VolumeMetrics struct { - VolumesTotal metric.Int64ObservableGauge - AllocatedBytes metric.Int64ObservableGauge - UsedBytes metric.Int64ObservableGauge - CreateDuration metric.Float64Histogram -} - -// NewVolumeMetrics creates metrics for the volume manager. -func NewVolumeMetrics(meter metric.Meter) (*VolumeMetrics, error) { - volumesTotal, err := meter.Int64ObservableGauge( - "hypeman_volumes_total", - metric.WithDescription("Total number of volumes"), - ) - if err != nil { - return nil, err - } - - allocatedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_allocated_bytes", - metric.WithDescription("Total allocated/provisioned volume size in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - usedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_used_bytes", - metric.WithDescription("Actual disk space consumed by volumes in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - createDuration, err := meter.Float64Histogram( - "hypeman_volumes_create_duration_seconds", - metric.WithDescription("Time to create a volume"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - return &VolumeMetrics{ - VolumesTotal: volumesTotal, - AllocatedBytes: allocatedBytes, - UsedBytes: usedBytes, - CreateDuration: createDuration, - }, nil -} 
- -// ExecMetrics holds metrics for the exec client. -type ExecMetrics struct { - SessionsTotal metric.Int64Counter - Duration metric.Float64Histogram - BytesSentTotal metric.Int64Counter - BytesReceivedTotal metric.Int64Counter -} - -// NewExecMetrics creates metrics for the exec client. -func NewExecMetrics(meter metric.Meter) (*ExecMetrics, error) { - sessionsTotal, err := meter.Int64Counter( - "hypeman_exec_sessions_total", - metric.WithDescription("Total number of exec sessions"), - ) - if err != nil { - return nil, err - } - - duration, err := meter.Float64Histogram( - "hypeman_exec_duration_seconds", - metric.WithDescription("Exec command duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - bytesSentTotal, err := meter.Int64Counter( - "hypeman_exec_bytes_sent_total", - metric.WithDescription("Total bytes sent to guest (stdin)"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - bytesReceivedTotal, err := meter.Int64Counter( - "hypeman_exec_bytes_received_total", - metric.WithDescription("Total bytes received from guest (stdout+stderr)"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - return &ExecMetrics{ - SessionsTotal: sessionsTotal, - Duration: duration, - BytesSentTotal: bytesSentTotal, - BytesReceivedTotal: bytesReceivedTotal, - }, nil -} - -// VMMMetrics holds metrics for the VMM client. -type VMMMetrics struct { - APIDuration metric.Float64Histogram - APIErrorsTotal metric.Int64Counter -} - -// NewVMMMetrics creates metrics for the VMM client. 
-func NewVMMMetrics(meter metric.Meter) (*VMMMetrics, error) { - apiDuration, err := meter.Float64Histogram( - "hypeman_vmm_api_duration_seconds", - metric.WithDescription("Cloud Hypervisor API call duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - apiErrorsTotal, err := meter.Int64Counter( - "hypeman_vmm_api_errors_total", - metric.WithDescription("Total number of Cloud Hypervisor API errors"), - ) - if err != nil { - return nil, err - } - - return &VMMMetrics{ - APIDuration: apiDuration, - APIErrorsTotal: apiErrorsTotal, - }, nil -} - -// HTTPMetrics holds metrics for HTTP middleware. -type HTTPMetrics struct { - RequestsTotal metric.Int64Counter - RequestDuration metric.Float64Histogram -} - -// NewHTTPMetrics creates metrics for HTTP middleware. -func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { - requestsTotal, err := meter.Int64Counter( - "hypeman_http_requests_total", - metric.WithDescription("Total number of HTTP requests"), - ) - if err != nil { - return nil, err - } - - requestDuration, err := meter.Float64Histogram( - "hypeman_http_request_duration_seconds", - metric.WithDescription("HTTP request duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - return &HTTPMetrics{ - RequestsTotal: requestsTotal, - RequestDuration: requestDuration, - }, nil -} diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index ac5b375a..d3a3dfc8 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -6,13 +6,11 @@ import ( "io" "os" "sync" - "syscall" "time" "github.com/nrednav/cuid2" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -37,11 +35,6 @@ type Manager interface { GetVolumePath(id string) string } -// Metrics holds the metrics instruments for volume operations. 
-type Metrics struct { - createDuration metric.Float64Histogram -} - type manager struct { paths *paths.Paths maxTotalVolumeStorage int64 // Maximum total volume storage in bytes (0 = unlimited) @@ -70,95 +63,6 @@ func NewManager(p *paths.Paths, maxTotalVolumeStorage int64, meter metric.Meter) return m } -// newVolumeMetrics creates and registers all volume metrics. -func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - createDuration, err := meter.Float64Histogram( - "hypeman_volumes_create_duration_seconds", - metric.WithDescription("Time to create a volume"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - // Register observable gauges - volumesTotal, err := meter.Int64ObservableGauge( - "hypeman_volumes_total", - metric.WithDescription("Total number of volumes"), - ) - if err != nil { - return nil, err - } - - allocatedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_allocated_bytes", - metric.WithDescription("Total allocated/provisioned volume size in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - usedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_used_bytes", - metric.WithDescription("Actual disk space consumed by volumes in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - volumes, err := m.ListVolumes(ctx) - if err != nil { - return nil - } - - o.ObserveInt64(volumesTotal, int64(len(volumes))) - - var totalAllocated, totalUsed int64 - for _, vol := range volumes { - // Allocated = provisioned size in GB - totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 - - // Used = actual disk blocks consumed (for sparse files) - diskPath := m.paths.VolumeData(vol.Id) - if stat, err := os.Stat(diskPath); err == nil { - // Get actual blocks used via syscall - if sysStat, ok := stat.Sys().(*syscall.Stat_t); ok { - totalUsed += sysStat.Blocks 
* 512 // Blocks are in 512-byte units - } - } - } - - o.ObserveInt64(allocatedBytes, totalAllocated) - o.ObserveInt64(usedBytes, totalUsed) - return nil - }, - volumesTotal, - allocatedBytes, - usedBytes, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - createDuration: createDuration, - }, nil -} - -// recordCreateDuration records the volume creation duration. -func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - m.metrics.createDuration.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - // getVolumeLock returns or creates a lock for a specific volume func (m *manager) getVolumeLock(id string) *sync.RWMutex { lock, _ := m.volumeLocks.LoadOrStore(id, &sync.RWMutex{}) diff --git a/lib/volumes/metrics.go b/lib/volumes/metrics.go new file mode 100644 index 00000000..ad8db491 --- /dev/null +++ b/lib/volumes/metrics.go @@ -0,0 +1,105 @@ +package volumes + +import ( + "context" + "os" + "syscall" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for volume operations. +type Metrics struct { + createDuration metric.Float64Histogram +} + +// newVolumeMetrics creates and registers all volume metrics. 
+func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + volumes, err := m.ListVolumes(ctx) + if err != nil { + return nil + } + + o.ObserveInt64(volumesTotal, int64(len(volumes))) + + var totalAllocated, totalUsed int64 + for _, vol := range volumes { + // Allocated = provisioned size in GB + totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 + + // Used = actual disk blocks consumed (for sparse files) + diskPath := m.paths.VolumeData(vol.Id) + if stat, err := os.Stat(diskPath); err == nil { + // Get actual blocks used via syscall + if sysStat, ok := stat.Sys().(*syscall.Stat_t); ok { + totalUsed += sysStat.Blocks * 512 // Blocks are in 512-byte units + } + } + } + + o.ObserveInt64(allocatedBytes, totalAllocated) + o.ObserveInt64(usedBytes, totalUsed) + return nil + }, + volumesTotal, + allocatedBytes, + usedBytes, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + }, nil +} + +// recordCreateDuration records the volume 
creation duration. +func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.createDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +}