From 8658eca200924f15096536ac54543bf17280009b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:10:26 -0500 Subject: [PATCH 01/14] Add tracing, metrics, configurable logging --- cmd/api/api/api_test.go | 8 +- cmd/api/config/config.go | 34 ++- cmd/api/main.go | 63 ++++- go.mod | 19 +- go.sum | 27 ++- lib/exec/client.go | 16 +- lib/exec/metrics.go | 105 +++++++++ lib/images/manager.go | 121 +++++++++- lib/images/manager_test.go | 30 +-- lib/images/queue.go | 6 + lib/instances/create.go | 26 ++- lib/instances/exec_test.go | 3 +- lib/instances/manager.go | 132 ++++++++++- lib/instances/manager_test.go | 142 ++++++------ lib/instances/resource_limits_test.go | 23 +- lib/instances/restore.go | 25 +- lib/instances/standby.go | 48 ++-- lib/instances/volumes_test.go | 12 +- lib/logger/README.md | 55 +++++ lib/logger/logger.go | 153 ++++++++++++- lib/middleware/otel.go | 99 ++++++++ lib/network/allocate.go | 5 +- lib/network/manager.go | 80 ++++++- lib/otel/README.md | 92 ++++++++ lib/otel/metrics.go | 317 ++++++++++++++++++++++++++ lib/otel/otel.go | 214 +++++++++++++++++ lib/providers/providers.go | 16 +- lib/vmm/metrics.go | 66 ++++++ lib/volumes/manager.go | 123 +++++++++- lib/volumes/manager_test.go | 3 +- 30 files changed, 1881 insertions(+), 182 deletions(-) create mode 100644 lib/exec/metrics.go create mode 100644 lib/logger/README.md create mode 100644 lib/middleware/otel.go create mode 100644 lib/otel/README.md create mode 100644 lib/otel/metrics.go create mode 100644 lib/otel/otel.go create mode 100644 lib/vmm/metrics.go diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index ab536869..9f010002 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -26,18 +26,18 @@ func newTestService(t *testing.T) *ApiService { } p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) if err != nil { t.Fatalf("failed to create image manager: %v", err) } systemMgr := 
system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) // 0 = unlimited storage + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil) // Register cleanup for orphaned Cloud Hypervisor processes t.Cleanup(func() { diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 0b59f6f0..ec810d7e 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -23,13 +23,23 @@ type Config struct { LogRotateInterval string // Resource limits - per instance - MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) - MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) + MaxVcpusPerInstance int // Max vCPUs for a single VM (0 = unlimited) + MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) // Resource limits - aggregate MaxTotalVcpus int // Aggregate vCPU limit across all instances (0 = unlimited) MaxTotalMemory string // Aggregate memory limit across all instances (0 = unlimited) MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) + + // OpenTelemetry configuration + OtelEnabled bool // Enable OpenTelemetry + OtelEndpoint string // OTLP endpoint (gRPC) + OtelServiceName string // Service name for tracing + OtelInsecure bool // Disable TLS for OTLP + Version string // Application version for telemetry + + // Logging configuration + LogLevel string // Default log level (debug, info, warn, error) } // Load loads configuration from environment variables @@ -43,7 +53,7 @@ func Load() *Config { DataDir: getEnv("DATA_DIR", "/var/lib/hypeman"), BridgeName: getEnv("BRIDGE_NAME", "vmbr0"), 
SubnetCIDR: getEnv("SUBNET_CIDR", "10.100.0.0/16"), - SubnetGateway: getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet + SubnetGateway: getEnv("SUBNET_GATEWAY", ""), // empty = derived as first IP from subnet UplinkInterface: getEnv("UPLINK_INTERFACE", ""), // empty = auto-detect from default route JwtSecret: getEnv("JWT_SECRET", ""), DNSServer: getEnv("DNS_SERVER", "1.1.1.1"), @@ -61,6 +71,16 @@ func Load() *Config { MaxTotalVcpus: getEnvInt("MAX_TOTAL_VCPUS", 0), MaxTotalMemory: getEnv("MAX_TOTAL_MEMORY", ""), MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), + + // OpenTelemetry configuration + OtelEnabled: getEnvBool("OTEL_ENABLED", false), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), + OtelInsecure: getEnvBool("OTEL_INSECURE", true), + Version: getEnv("VERSION", "dev"), + + // Logging configuration + LogLevel: getEnv("LOG_LEVEL", "info"), } return cfg @@ -82,3 +102,11 @@ func getEnvInt(key string, defaultValue int) int { return defaultValue } +func getEnvBool(key string, defaultValue bool) bool { + if value := os.Getenv(key); value != "" { + if boolVal, err := strconv.ParseBool(value); err == nil { + return boolVal + } + } + return defaultValue +} diff --git a/cmd/api/main.go b/cmd/api/main.go index d2ad80f2..4ff139b1 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -20,9 +20,13 @@ import ( nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware" "github.com/onkernel/hypeman" "github.com/onkernel/hypeman/cmd/api/api" + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/exec" "github.com/onkernel/hypeman/lib/instances" mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" + "github.com/onkernel/hypeman/lib/otel" + "github.com/riandyrn/otelchi" "golang.org/x/sync/errgroup" ) @@ -34,6 +38,41 @@ func main() { } func run() error { + // Load config early for OTel initialization + cfg := 
config.Load() + + // Initialize OpenTelemetry (before wire initialization) + otelCfg := otel.Config{ + Enabled: cfg.OtelEnabled, + Endpoint: cfg.OtelEndpoint, + ServiceName: cfg.OtelServiceName, + Insecure: cfg.OtelInsecure, + Version: cfg.Version, + } + + otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) + if err != nil { + // Log warning but don't fail - graceful degradation + slog.Warn("failed to initialize OpenTelemetry, continuing without telemetry", "error", err) + } + if otelShutdown != nil { + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := otelShutdown(shutdownCtx); err != nil { + slog.Warn("error shutting down OpenTelemetry", "error", err) + } + }() + } + + // Initialize exec metrics if OTel is enabled + if otelProvider != nil && otelProvider.Meter != nil { + execMetrics, err := exec.NewMetrics(otelProvider.Meter) + if err == nil { + exec.SetMetrics(execMetrics) + } + } + // Initialize app with wire app, cleanup, err := initializeApp() if err != nil { @@ -46,6 +85,11 @@ func run() error { logger := app.Logger + // Log OTel status + if cfg.OtelEnabled { + logger.Info("OpenTelemetry enabled", "endpoint", cfg.OtelEndpoint, "service", cfg.OtelServiceName) + } + // Validate JWT secret is configured if app.Config.JwtSecret == "" { logger.Warn("JWT_SECRET not configured - API authentication will fail") @@ -84,6 +128,24 @@ func run() error { // Create router r := chi.NewRouter() + // Add OpenTelemetry tracing middleware if enabled + if cfg.OtelEnabled { + r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) + } + + // Add HTTP metrics middleware if OTel is enabled + var httpMetricsMw func(http.Handler) http.Handler + if otelProvider != nil && otelProvider.Meter != nil { + httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) + if err == nil { + httpMetricsMw = httpMetrics.Middleware + } + } + if httpMetricsMw == nil { + httpMetricsMw = 
mw.NoopHTTPMetrics() + } + r.Use(httpMetricsMw) + // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() if err != nil { @@ -225,4 +287,3 @@ func getRunningInstanceIDs(app *application) []string { } return running } - diff --git a/go.mod b/go.mod index 0f4f6cb0..7c894df1 100644 --- a/go.mod +++ b/go.mod @@ -22,10 +22,19 @@ require ( github.com/opencontainers/image-spec v1.1.1 github.com/opencontainers/runtime-spec v1.2.1 github.com/opencontainers/umoci v0.6.0 + github.com/riandyrn/otelchi v0.12.2 github.com/samber/lo v1.52.0 github.com/stretchr/testify v1.11.1 github.com/u-root/u-root v0.15.0 github.com/vishvananda/netlink v1.3.1 + go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/metric v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 golang.org/x/sync v0.17.0 golang.org/x/sys v0.38.0 golang.org/x/term v0.37.0 @@ -39,17 +48,22 @@ require ( github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/apex/log v1.9.0 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/docker/cli v28.2.2+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect 
github.com/go-test/deep v1.1.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/pgzip v1.2.6 // indirect @@ -66,7 +80,6 @@ require ( github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/sirupsen/logrus v1.9.4-0.20230606125235-dd1b4c2e81af // indirect github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 // indirect @@ -74,9 +87,13 @@ require ( github.com/vbatts/tar-split v0.12.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index ac155882..91d5185a 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 h1:6lhrsTEnloDPXyeZBvSYvQf8u86jbKehZPVDDlkgDl4= github.com/c2h5oh/datasize 
v0.0.0-20231215233829-aa82cc1e6500/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8= github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= @@ -36,6 +38,8 @@ github.com/docker/docker-credential-helpers v0.9.3/go.mod h1:x+4Gbw9aGmChi3qTLZj github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/getkin/kin-openapi v0.133.0 h1:pJdmNohVIJ97r4AUFtEXRXwESr8b0bD721u/Tz6k8PQ= github.com/getkin/kin-openapi v0.133.0/go.mod h1:boAciF6cXk5FhPqe/NQeBTeenbjqU4LhWBf09ILVvWE= @@ -44,6 +48,7 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME github.com/go-chi/chi/v5 v5.2.3 h1:WQIt9uxdsAbgIYgid+BpYc+liqQZGMHRaUwp0JUcvdE= github.com/go-chi/chi/v5 v5.2.3/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -75,6 +80,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= @@ -144,9 +151,11 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/riandyrn/otelchi v0.12.2 h1:6QhGv0LVw/dwjtPd12mnNrl0oEQF4ZAlmHcnlTYbeAg= +github.com/riandyrn/otelchi v0.12.2/go.mod h1:weZZeUJURvtCcbWsdb7Y6F8KFZGedJlSrgUjq9VirV8= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= 
github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 h1:58jvc5cZ+hGKidQ4Z37/+rj9eQxRRjOOsqNEwPSZXR4= github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67/go.mod h1:LLjEAc6zmycfeN7/1fxIphWQPjHpTt7ElqT7eVf8e4A= github.com/samber/lo v1.52.0 h1:Rvi+3BFHES3A8meP33VPAxiBZX/Aws5RxrschYGjomw= @@ -189,8 +198,16 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod 
h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= @@ -199,6 +216,10 @@ go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6 go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= @@ -230,6 +251,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 h1:M1rk8KBnUsBDg1oPGHNCxG4vc1f49epmTO7xscSajMk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8/go.mod 
h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= diff --git a/lib/exec/client.go b/lib/exec/client.go index 0484bc96..4ca9fd40 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -9,6 +9,7 @@ import ( "net" "strings" "sync" + "sync/atomic" "time" "google.golang.org/grpc" @@ -111,6 +112,9 @@ func (c *bufferedConn) Read(p []byte) (int, error) { // ExecIntoInstance executes command in instance via vsock using gRPC // vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { + start := time.Now() + var bytesSent int64 + // Get or create a reusable gRPC connection for this vsock socket // Connection pooling avoids issues with rapid connect/disconnect cycles grpcConn, err := getOrCreateConn(ctx, vsockSocketPath) @@ -146,13 +150,14 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti // Handle stdin in background if opts.Stdin != nil { go func() { - buf := make([]byte, 32 * 1024) + buf := make([]byte, 32*1024) for { n, err := opts.Stdin.Read(buf) if n > 0 { stream.Send(&ExecRequest{ Request: &ExecRequest_Stdin{Stdin: buf[:n]}, }) + atomic.AddInt64(&bytesSent, int64(n)) } if err != nil { stream.CloseSend() @@ -185,7 +190,13 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti opts.Stderr.Write(r.Stderr) } case *ExecResponse_ExitCode: - return &ExitStatus{Code: int(r.ExitCode)}, nil + exitCode := int(r.ExitCode) + // Record metrics + if ExecMetrics != nil { + bytesReceived := int64(totalStdout + totalStderr) + ExecMetrics.RecordSession(ctx, start, exitCode, atomic.LoadInt64(&bytesSent), bytesReceived) + } + return &ExitStatus{Code: exitCode}, nil } } } @@ -250,4 +261,3 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { // This ensures any 
bytes buffered during handshake are not lost return &bufferedConn{Conn: conn, reader: reader}, nil } - diff --git a/lib/exec/metrics.go b/lib/exec/metrics.go new file mode 100644 index 00000000..ef1c83c7 --- /dev/null +++ b/lib/exec/metrics.go @@ -0,0 +1,105 @@ +package exec + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for exec operations. +type Metrics struct { + sessionsTotal metric.Int64Counter + duration metric.Float64Histogram + bytesSentTotal metric.Int64Counter + bytesReceivedTotal metric.Int64Counter +} + +// ExecMetrics is the global metrics instance for the exec package. +// Set this via SetMetrics() during application initialization. +var ExecMetrics *Metrics + +// SetMetrics sets the global metrics instance. +func SetMetrics(m *Metrics) { + ExecMetrics = m +} + +// NewMetrics creates exec metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + sessionsTotal, err := meter.Int64Counter( + "hypeman_exec_sessions_total", + metric.WithDescription("Total number of exec sessions"), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "hypeman_exec_duration_seconds", + metric.WithDescription("Exec command duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + bytesSentTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_sent_total", + metric.WithDescription("Total bytes sent to guest (stdin)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + bytesReceivedTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_received_total", + metric.WithDescription("Total bytes received from guest (stdout+stderr)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + sessionsTotal: sessionsTotal, + duration: 
duration, + bytesSentTotal: bytesSentTotal, + bytesReceivedTotal: bytesReceivedTotal, + }, nil +} + +// RecordSession records metrics for a completed exec session. +func (m *Metrics) RecordSession(ctx context.Context, start time.Time, exitCode int, bytesSent, bytesReceived int64) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if exitCode != 0 { + status = "error" + } + + m.sessionsTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("status", status), + attribute.Int("exit_code", exitCode), + )) + + m.duration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) + + if bytesSent > 0 { + m.bytesSentTotal.Add(ctx, bytesSent) + } + if bytesReceived > 0 { + m.bytesReceivedTotal.Add(ctx, bytesReceived) + } +} diff --git a/lib/images/manager.go b/lib/images/manager.go index 8d6a403b..d167c5bd 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -10,6 +10,8 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) const ( @@ -28,15 +30,23 @@ type Manager interface { RecoverInterruptedBuilds() } +// Metrics holds the metrics instruments for image operations. +type Metrics struct { + buildDuration metric.Float64Histogram + pullsTotal metric.Int64Counter +} + type manager struct { paths *paths.Paths ociClient *ociClient queue *BuildQueue createMu sync.Mutex + metrics *Metrics } -// NewManager creates a new image manager -func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { +// NewManager creates a new image manager. +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Manager, error) { // Create cache directory under dataDir for OCI layouts cacheDir := p.SystemOCICache() ociClient, err := newOCIClient(cacheDir) @@ -49,10 +59,89 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int) (Manager, error) { ociClient: ociClient, queue: NewBuildQueue(maxConcurrentBuilds), } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newMetrics(meter, m) + if err != nil { + return nil, fmt.Errorf("create metrics: %w", err) + } + m.metrics = metrics + } + m.RecoverInterruptedBuilds() return m, nil } +// newMetrics creates and registers all image metrics. +func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges for queue length and total images + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + // Report queue length + o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) + + // Count images by status + metas, err := listAllTags(m.paths) + if err != nil { + return nil + } + statusCounts := make(map[string]int64) + for _, meta := range metas { + 
statusCounts[meta.Status]++ + } + for status, count := range statusCounts { + o.ObserveInt64(imagesTotal, count, + metric.WithAttributes(attribute.String("status", status))) + } + return nil + }, + buildQueueLength, + imagesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + buildDuration: buildDuration, + pullsTotal: pullsTotal, + }, nil +} + func (m *manager) ListImages(ctx context.Context) ([]Image, error) { metas, err := listAllTags(m.paths) if err != nil { @@ -78,7 +167,7 @@ func (m *manager) CreateImage(ctx context.Context, req CreateImageRequest) (*Ima // Add a 2-second timeout to ensure fast failure on rate limits or errors resolveCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() - + ref, err := normalized.Resolve(resolveCtx, m.ociClient) if err != nil { return nil, fmt.Errorf("resolve manifest: %w", err) @@ -134,11 +223,13 @@ func (m *manager) createAndQueueImage(ref *ResolvedRef) (*Image, error) { } func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { + buildStart := time.Now() buildDir := m.paths.SystemBuild(ref.String()) tempDir := filepath.Join(buildDir, "rootfs") if err := os.MkdirAll(buildDir, 0755); err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("create build dir: %w", err)) + m.recordBuildMetrics(ctx, buildStart, "failed") return } @@ -153,8 +244,11 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { result, err := m.ociClient.pullAndExport(ctx, ref.String(), ref.Digest(), tempDir) if err != nil { m.updateStatusByDigest(ref, StatusFailed, fmt.Errorf("pull and export: %w", err)) + m.recordPullMetrics(ctx, "failed") + m.recordBuildMetrics(ctx, buildStart, "failed") return } + m.recordPullMetrics(ctx, "success") // Check if this digest already exists and is ready (deduplication) if meta, err := readMetadata(m.paths, ref.Repository(), ref.DigestHex()); err == nil { @@ -209,6 +303,27 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { 
fmt.Fprintf(os.Stderr, "Warning: failed to create tag symlink: %v\n", err) } } + + m.recordBuildMetrics(ctx, buildStart, "success") +} + +// recordBuildMetrics records the build duration metric. +func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.buildDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordPullMetrics records the pull counter metric. +func (m *manager) recordPullMetrics(ctx context.Context, status string) { + if m.metrics == nil { + return + } + m.metrics.pullsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("status", status))) } func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err error) { diff --git a/lib/images/manager_test.go b/lib/images/manager_test.go index 984a7ba3..406566ef 100644 --- a/lib/images/manager_test.go +++ b/lib/images/manager_test.go @@ -14,7 +14,7 @@ import ( func TestCreateImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -41,7 +41,7 @@ func TestCreateImage(t *testing.T) { ref, err := ParseNormalizedRef(img.Name) require.NoError(t, err) digestHex := strings.SplitN(img.Digest, ":", 2)[1] - + // Check erofs disk file diskPath := digestPath(paths.New(dataDir), ref.Repository(), digestHex) diskStat, err := os.Stat(diskPath) @@ -56,7 +56,7 @@ func TestCreateImage(t *testing.T) { metaStat, err := os.Stat(metadataPath) require.NoError(t, err) require.False(t, metaStat.IsDir(), "metadata should be a file") - + // Read and verify metadata content meta, err := readMetadata(paths.New(dataDir), ref.Repository(), digestHex) require.NoError(t, err) @@ -66,7 +66,7 @@ func TestCreateImage(t *testing.T) { require.Nil(t, meta.Error) require.Equal(t, diskStat.Size(), meta.SizeBytes) 
require.NotEmpty(t, meta.Env, "should have environment variables") - t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", + t.Logf("Metadata: name=%s, digest=%s, status=%s, env_vars=%d", meta.Name, meta.Digest, meta.Status, len(meta.Env)) // Check that tag symlink exists and points to correct digest @@ -74,7 +74,7 @@ func TestCreateImage(t *testing.T) { linkStat, err := os.Lstat(linkPath) require.NoError(t, err) require.NotEqual(t, 0, linkStat.Mode()&os.ModeSymlink, "should be a symlink") - + // Verify symlink points to digest directory linkTarget, err := os.Readlink(linkPath) require.NoError(t, err) @@ -84,7 +84,7 @@ func TestCreateImage(t *testing.T) { func TestCreateImageDifferentTag(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -98,7 +98,7 @@ func TestCreateImageDifferentTag(t *testing.T) { require.Equal(t, "docker.io/library/alpine:3.18", img.Name) waitForReady(t, mgr, ctx, img.Name) - + img, err = mgr.GetImage(ctx, img.Name) require.NoError(t, err) require.NotEmpty(t, img.Digest) @@ -106,7 +106,7 @@ func TestCreateImageDifferentTag(t *testing.T) { func TestCreateImageDuplicate(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -135,7 +135,7 @@ func TestCreateImageDuplicate(t *testing.T) { func TestListImages(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -164,7 +164,7 @@ func TestListImages(t *testing.T) { func TestGetImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ 
-188,7 +188,7 @@ func TestGetImage(t *testing.T) { func TestGetImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -199,7 +199,7 @@ func TestGetImageNotFound(t *testing.T) { func TestDeleteImage(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -234,7 +234,7 @@ func TestDeleteImage(t *testing.T) { func TestDeleteImageNotFound(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -271,7 +271,7 @@ func TestNormalizedRefParsing(t *testing.T) { func TestLayerCaching(t *testing.T) { dataDir := t.TempDir() - mgr, err := NewManager(paths.New(dataDir), 1) + mgr, err := NewManager(paths.New(dataDir), 1, nil) require.NoError(t, err) ctx := context.Background() @@ -326,7 +326,7 @@ func TestLayerCaching(t *testing.T) { digestHex := strings.TrimPrefix(alpine1.Digest, "sha256:") disk1 := digestPath(paths.New(dataDir), alpine1Parsed.Repository(), digestHex) disk2 := digestPath(paths.New(dataDir), alpine2Parsed.Repository(), digestHex) - + require.Equal(t, disk1, disk2, "both references should point to same disk") stat, err := os.Stat(disk1) diff --git a/lib/images/queue.go b/lib/images/queue.go index 115d267c..5d08100a 100644 --- a/lib/images/queue.go +++ b/lib/images/queue.go @@ -113,3 +113,9 @@ func (q *BuildQueue) PendingCount() int { return len(q.pending) } +// QueueLength returns the total number of builds (active + pending) +func (q *BuildQueue) QueueLength() int { + q.mu.Lock() + defer q.mu.Unlock() + return len(q.active) + len(q.pending) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index faa32279..b82e421d 100644 --- 
a/lib/instances/create.go +++ b/lib/instances/create.go @@ -15,6 +15,7 @@ import ( "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/vmm" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" ) @@ -93,9 +94,17 @@ func (m *manager) createInstance( ctx context.Context, req CreateInstanceRequest, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "creating instance", "name", req.Name, "image", req.Image, "vcpus", req.Vcpus) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "CreateInstance") + defer span.End() + } + // 1. Validate request if err := validateCreateRequest(req); err != nil { log.ErrorContext(ctx, "invalid create request", "error", err) @@ -336,6 +345,12 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.createDuration, start, "success") + m.recordStateTransition(ctx, "stopped", string(StateRunning)) + } + // Return instance with derived state finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance created successfully", "id", id, "name", req.Name, "state", finalInst.State) @@ -452,14 +467,14 @@ func (m *manager) startAndBootVM( netConfig *network.NetworkConfig, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -539,7 +554,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo 
*images.Image, netConf BootVcpus: inst.Vcpus, MaxVcpus: inst.Vcpus, } - + // Calculate and set guest topology based on host topology if topology := calculateGuestTopology(inst.Vcpus, m.hostTopology); topology != nil { cpus.Topology = topology @@ -551,7 +566,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf } if inst.HotplugSize > 0 { memory.HotplugSize = &inst.HotplugSize - memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case + memory.HotplugMethod = ptr("VirtioMem") // PascalCase, not kebab-case } // Disk configuration @@ -644,4 +659,3 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf func ptr[T any](v T) *T { return &v } - diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index ef0ba8f0..cee4e71b 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -46,7 +46,7 @@ func TestExecConcurrent(t *testing.T) { p := paths.New(tmpDir) // Setup image - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling nginx:alpine image...") @@ -215,4 +215,3 @@ func TestExecConcurrent(t *testing.T) { t.Logf("Long-running streams completed in %v (concurrent OK)", streamElapsed) } - diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 0c939a5c..68e9b36c 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,12 +4,16 @@ import ( "context" "fmt" "sync" + "time" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" ) type Manager interface { @@ -35,6 +39,15 @@ type ResourceLimits struct { MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) } +// 
Metrics holds the metrics instruments for instance operations. +type Metrics struct { + createDuration metric.Float64Histogram + restoreDuration metric.Float64Histogram + standbyDuration metric.Float64Histogram + stateTransitions metric.Int64Counter + tracer trace.Tracer +} + type manager struct { paths *paths.Paths imageManager images.Manager @@ -44,11 +57,13 @@ type manager struct { limits ResourceLimits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks hostTopology *HostTopology // Cached host CPU topology + metrics *Metrics } -// NewManager creates a new instances manager -func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits) Manager { - return &manager{ +// NewManager creates a new instances manager. +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, volumeManager volumes.Manager, limits ResourceLimits, meter metric.Meter, tracer trace.Tracer) Manager { + m := &manager{ paths: p, imageManager: imageManager, systemManager: systemManager, @@ -58,6 +73,115 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste instanceLocks: sync.Map{}, hostTopology: detectHostTopology(), // Detect and cache host topology } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newInstanceMetrics(meter, tracer, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newInstanceMetrics creates and registers all instance metrics. 
+func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for instance counts by state + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + instances, err := m.listInstances(ctx) + if err != nil { + return nil + } + stateCounts := make(map[string]int64) + for _, inst := range instances { + stateCounts[string(inst.State)]++ + } + for state, count := range stateCounts { + o.ObserveInt64(instancesTotal, count, + metric.WithAttributes(attribute.String("state", state))) + } + return nil + }, + instancesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + restoreDuration: restoreDuration, + standbyDuration: standbyDuration, + stateTransitions: stateTransitions, + tracer: tracer, + }, nil +} + +// recordDuration records operation 
duration. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + histogram.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordStateTransition records a state transition. +func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { + if m.metrics == nil { + return + } + m.metrics.stateTransitions.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", fromState), + attribute.String("to", toState), + )) } // getInstanceLock returns or creates a lock for a specific instance @@ -81,7 +205,7 @@ func (m *manager) DeleteInstance(ctx context.Context, id string) error { lock := m.getInstanceLock(id) lock.Lock() defer lock.Unlock() - + err := m.deleteInstance(ctx, id) if err == nil { // Clean up the lock after successful deletion diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 4b533e0a..8534cdf4 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -26,21 +26,21 @@ import ( // setupTestManager creates a manager and registers cleanup for any orphaned processes func setupTestManager(t *testing.T) (*manager, string) { tmpDir := t.TempDir() - + cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } - + p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) - + systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := ResourceLimits{ 
MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -48,20 +48,20 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) - + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) + // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { cleanupOrphanedProcesses(t, mgr) }) - + return mgr, tmpDir } // waitForVMReady polls VM state via VMM API until it's running or times out func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { // Try to connect to VMM client, err := vmm.NewVMM(socketPath) @@ -70,48 +70,48 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio time.Sleep(100 * time.Millisecond) continue } - + // Get VM info infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(100 * time.Millisecond) continue } - + // Check if VM is running if infoResp.JSON200.State == vmm.Running { return nil } - + time.Sleep(100 * time.Millisecond) } - + return fmt.Errorf("VM did not reach running state within %v", timeout) } // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) - + for time.Now().Before(deadline) { logs, err := collectLogs(ctx, mgr, instanceID, 200) if err != nil { time.Sleep(100 * time.Millisecond) continue } - + if strings.Contains(logs, message) { return nil } - + time.Sleep(100 * time.Millisecond) } - + return 
fmt.Errorf("message %q not found in logs within %v", message, timeout) } @@ -121,12 +121,12 @@ func collectLogs(ctx context.Context, mgr *manager, instanceID string, n int) (s if err != nil { return "", err } - + var lines []string for line := range logChan { lines = append(lines, line) } - + return strings.Join(lines, "\n"), nil } @@ -137,26 +137,26 @@ func cleanupOrphanedProcesses(t *testing.T, mgr *manager) { if err != nil { return // No metadata files, nothing to clean } - + for _, metaFile := range metaFiles { // Extract instance ID from path id := filepath.Base(filepath.Dir(metaFile)) - + // Load metadata meta, err := mgr.loadMetadata(id) if err != nil { continue } - + // If metadata has a PID, try to kill it if meta.CHPID != nil { pid := *meta.CHPID - + // Check if process exists if err := syscall.Kill(pid, 0); err == nil { t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d (instance %s)", pid, id) syscall.Kill(pid, syscall.SIGKILL) - + // Wait for process to exit WaitForProcessExit(pid, 1*time.Second) } @@ -174,7 +174,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { ctx := context.Background() // Get the image manager from the manager (we need it for image operations) - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (runs a daemon, won't exit) @@ -210,7 +210,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create a volume to attach p := paths.New(tmpDir) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "test-data", @@ -226,12 +226,12 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Create instance with real nginx image and attached volume req := CreateInstanceRequest{ - Name: "test-nginx", - Image: 
"docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, // 512MB - HotplugSize: 512 * 1024 * 1024, // 512MB - OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB - Vcpus: 1, + Name: "test-nginx", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, // 512MB + HotplugSize: 512 * 1024 * 1024, // 512MB + OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + Vcpus: 1, NetworkEnabled: false, // No network for tests Env: map[string]string{ "TEST_VAR": "test_value", @@ -299,33 +299,33 @@ func TestCreateAndDeleteInstance(t *testing.T) { for i := 0; i < 50; i++ { // Poll for up to 5 seconds (50 * 100ms) logs, err = collectLogs(ctx, manager, inst.Id, 100) require.NoError(t, err) - + if strings.Contains(logs, "start worker processes") { foundNginxStartup = true break } time.Sleep(100 * time.Millisecond) } - + t.Logf("Instance logs (last 100 lines):\n%s", logs) - + // Verify nginx started successfully assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 5 seconds") // Test volume is accessible from inside the guest via exec t.Log("Testing volume from inside guest via exec...") - + // Helper to run command in guest with retry (exec agent may need time between connections) runCmd := func(command ...string) (string, int, error) { var lastOutput string var lastExitCode int var lastErr error - + for attempt := 0; attempt < 5; attempt++ { if attempt > 0 { time.Sleep(200 * time.Millisecond) } - + var stdout, stderr bytes.Buffer exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ Command: command, @@ -333,31 +333,31 @@ func TestCreateAndDeleteInstance(t *testing.T) { Stderr: &stderr, TTY: false, }) - + // Combine stdout and stderr output := stdout.String() if stderr.Len() > 0 { output += stderr.String() } output = strings.TrimSpace(output) - + if err != nil { lastErr = err lastOutput = output lastExitCode = -1 continue } - + lastOutput = output lastExitCode = exit.Code lastErr = nil - + // Success if we got output or 
it's a command expected to have no output if output != "" || exit.Code == 0 { return output, exit.Code, nil } } - + return lastOutput, lastExitCode, lastErr } @@ -375,11 +375,11 @@ func TestCreateAndDeleteInstance(t *testing.T) { echo "=== Volume mount info ===" df -h /mnt/data `, testContent) - + output, exitCode, err := runCmd("sh", "-c", script) require.NoError(t, err, "Volume test script should execute") require.Equal(t, 0, exitCode, "Volume test script should succeed") - + // Verify all expected output is present require.Contains(t, output, "lost+found", "Volume should be ext4-formatted") require.Contains(t, output, testContent, "Should be able to read written content") @@ -455,7 +455,7 @@ func TestCreateAndDeleteInstance(t *testing.T) { // Verify volume is gone _, err = volumeManager.GetVolume(ctx, vol.Id) assert.ErrorIs(t, err, volumes.ErrNotFound) - + t.Log("Instance and volume lifecycle test complete!") } @@ -464,17 +464,17 @@ func TestStorageOperations(t *testing.T) { tmpDir := t.TempDir() cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", + DataDir: tmpDir, + BridgeName: "vmbr0", SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DNSServer: "1.1.1.1", } p := paths.New(tmpDir) - imageManager, _ := images.NewManager(p, 1) + imageManager, _ := images.NewManager(p, 1, nil) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) // 0 = unlimited storage + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage limits := ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited @@ -482,7 +482,7 @@ func TestStorageOperations(t *testing.T) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - manager := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + manager := NewManager(p, imageManager, 
systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Test metadata doesn't exist initially _, err := manager.loadMetadata("nonexistent") @@ -544,7 +544,7 @@ func TestStandbyAndRestore(t *testing.T) { ctx := context.Background() // Create image manager for pulling nginx - imageManager, err := images.NewManager(paths.New(tmpDir), 1) + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) require.NoError(t, err) // Pull nginx image (reuse if already pulled in previous test) @@ -577,14 +577,14 @@ func TestStandbyAndRestore(t *testing.T) { // Create instance t.Log("Creating instance...") req := CreateInstanceRequest{ - Name: "test-standby", - Image: "docker.io/library/nginx:alpine", - Size: 512 * 1024 * 1024, - HotplugSize: 512 * 1024 * 1024, - OverlaySize: 10 * 1024 * 1024 * 1024, - Vcpus: 1, + Name: "test-standby", + Image: "docker.io/library/nginx:alpine", + Size: 512 * 1024 * 1024, + HotplugSize: 512 * 1024 * 1024, + OverlaySize: 10 * 1024 * 1024 * 1024, + Vcpus: 1, NetworkEnabled: false, // No network for tests - Env: map[string]string{}, + Env: map[string]string{}, } inst, err := manager.CreateInstance(ctx, req) @@ -610,7 +610,7 @@ func TestStandbyAndRestore(t *testing.T) { assert.DirExists(t, snapshotDir) assert.FileExists(t, filepath.Join(snapshotDir, "memory-ranges")) // Cloud Hypervisor creates various snapshot files, just verify directory exists - + // DEBUG: Check snapshot files (for comparison with networking test) t.Log("DEBUG: Snapshot files for non-network instance:") entries, _ := os.ReadDir(snapshotDir) @@ -618,7 +618,7 @@ func TestStandbyAndRestore(t *testing.T) { info, _ := entry.Info() t.Logf(" - %s (size: %d bytes)", entry.Name(), info.Size()) } - + // DEBUG: Check console.log file size before restore consoleLogPath := filepath.Join(tmpDir, "guests", inst.Id, "logs", "console.log") var consoleLogSizeBefore int64 @@ -633,7 +633,7 @@ func TestStandbyAndRestore(t *testing.T) { require.NoError(t, err) 
assert.Equal(t, StateRunning, inst.State) t.Log("Instance restored and running") - + // DEBUG: Check console.log file size after restore if info, err := os.Stat(consoleLogPath); err == nil { consoleLogSizeAfter := info.Size() @@ -648,7 +648,7 @@ func TestStandbyAndRestore(t *testing.T) { t.Log("Cleaning up...") err = manager.DeleteInstance(ctx, inst.Id) require.NoError(t, err) - + t.Log("Standby/restore test complete!") } @@ -685,6 +685,4 @@ func TestStateTransitions(t *testing.T) { } } - // No mock image manager needed - tests use real images! - diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index ac27b561..83930421 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -154,23 +154,23 @@ func createTestManager(t *testing.T, limits ResourceLimits) *manager { cfg := &config.Config{DataDir: tmpDir} p := paths.New(cfg.DataDir) - imageMgr, err := images.NewManager(p, 1) + imageMgr, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemMgr := system.NewManager(p) - networkMgr := network.NewManager(p, cfg) - volumeMgr := volumes.NewManager(p, 0) + networkMgr := network.NewManager(p, cfg, nil) + volumeMgr := volumes.NewManager(p, 0, nil) - return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits).(*manager) + return NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil).(*manager) } func TestResourceLimits_StructValues(t *testing.T) { limits := ResourceLimits{ - MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB MaxVcpusPerInstance: 4, - MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB + MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB MaxTotalVcpus: 16, - MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB + MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB } assert.Equal(t, int64(10*1024*1024*1024), limits.MaxOverlaySize) @@ -246,12 +246,12 @@ func 
TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { } p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg) - volumeManager := volumes.NewManager(p, 0) + networkManager := network.NewManager(p, cfg, nil) + volumeManager := volumes.NewManager(p, 0, nil) // Set small aggregate limits: // - MaxTotalVcpus: 2 (first VM gets 1, second wants 2 -> denied) @@ -264,7 +264,7 @@ func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { MaxTotalMemory: 2 * 1024 * 1024 * 1024, // aggregate: only 2GB total } - mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits).(*manager) + mgr := NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil).(*manager) // Cleanup any orphaned processes on test end t.Cleanup(func() { @@ -375,4 +375,3 @@ func cleanupTestProcesses(t *testing.T, mgr *manager) { } } } - diff --git a/lib/instances/restore.go b/lib/instances/restore.go index ffce3d36..53b7b3e9 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -8,18 +8,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // RestoreInstance restores an instance from standby // Multi-hop orchestration: Standby → Paused → Running func (m *manager) restoreInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "restoring instance from standby", "id", id) - + + // Start tracing span if tracer is available + if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "RestoreInstance") + defer span.End() + } + // 1. 
Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -108,6 +117,12 @@ func (m *manager) restoreInstance( log.WarnContext(ctx, "failed to update metadata after restore", "id", id, "error", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.restoreDuration, start, "success") + m.recordStateTransition(ctx, string(StateStandby), string(StateRunning)) + } + // Return instance with derived state (should be Running now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance restored successfully", "id", id, "state", finalInst.State) @@ -121,14 +136,14 @@ func (m *manager) restoreFromSnapshot( snapshotDir string, ) error { log := logger.FromContext(ctx) - + // Start VMM process and capture PID log.DebugContext(ctx, "starting VMM process for restore", "id", stored.Id, "version", stored.CHVersion) pid, err := vmm.StartProcess(ctx, m.paths, stored.CHVersion, stored.SocketPath) if err != nil { return fmt.Errorf("start vmm: %w", err) } - + // Store the PID for later cleanup stored.CHPID = &pid log.DebugContext(ctx, "VMM process started", "id", stored.Id, "pid", pid) @@ -162,5 +177,3 @@ func (m *manager) restoreFromSnapshot( log.DebugContext(ctx, "VM restored from snapshot successfully", "id", stored.Id) return nil } - - diff --git a/lib/instances/standby.go b/lib/instances/standby.go index ad745164..68e67661 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -9,18 +9,27 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/vmm" + "go.opentelemetry.io/otel/trace" ) // StandbyInstance puts an instance in standby state // Multi-hop orchestration: Running → Paused → Standby func (m *manager) standbyInstance( ctx context.Context, - + id string, ) (*Instance, error) { + start := time.Now() log := logger.FromContext(ctx) log.InfoContext(ctx, "putting instance in standby", "id", id) - + + // Start tracing span if tracer is available 
+ if m.metrics != nil && m.metrics.tracer != nil { + var span trace.Span + ctx, span = m.metrics.tracer.Start(ctx, "StandbyInstance") + defer span.End() + } + // 1. Load instance meta, err := m.loadMetadata(id) if err != nil { @@ -110,6 +119,12 @@ func (m *manager) standbyInstance( return nil, fmt.Errorf("save metadata: %w", err) } + // Record metrics + if m.metrics != nil { + m.recordDuration(ctx, m.metrics.standbyDuration, start, "success") + m.recordStateTransition(ctx, string(StateRunning), string(StateStandby)) + } + // Return instance with derived state (should be Standby now) finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance put in standby successfully", "id", id, "state", finalInst.State) @@ -131,34 +146,34 @@ func reduceMemory(ctx context.Context, client *vmm.VMM, targetBytes int64) error func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeout time.Duration) error { log := logger.FromContext(ctx) deadline := time.Now().Add(timeout) - + // Use 20ms for fast response with minimal overhead const pollInterval = 20 * time.Millisecond const stabilityThreshold = 3 // Memory unchanged for 3 checks = stable - + var previousSize *int64 unchangedCount := 0 - + for time.Now().Before(deadline) { infoResp, err := client.GetVmInfoWithResponse(ctx) if err != nil { time.Sleep(pollInterval) continue } - + if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { time.Sleep(pollInterval) continue } - + actualSize := infoResp.JSON200.MemoryActualSize if actualSize == nil { time.Sleep(pollInterval) continue } - + currentSize := *actualSize - + // Best case: reached target or below if currentSize <= targetBytes { log.DebugContext(ctx, "memory reduced to target", @@ -166,7 +181,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo "actual_mb", currentSize/(1024*1024)) return nil } - + // Check if memory has stopped shrinking (stabilized above target) if previousSize != nil { if currentSize == 
*previousSize { @@ -185,11 +200,11 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo unchangedCount = 0 } } - + previousSize = &currentSize time.Sleep(pollInterval) } - + // Timeout - memory never stabilized return fmt.Errorf("memory reduction did not complete within %v", timeout) } @@ -197,7 +212,7 @@ func pollVMMemory(ctx context.Context, client *vmm.VMM, targetBytes int64, timeo // createSnapshot creates a Cloud Hypervisor snapshot func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) error { log := logger.FromContext(ctx) - + // Remove old snapshot os.RemoveAll(snapshotDir) @@ -227,7 +242,7 @@ func createSnapshot(ctx context.Context, client *vmm.VMM, snapshotDir string) er // shutdownVMM gracefully shuts down the VMM process via API func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log := logger.FromContext(ctx) - + // Try to connect to VMM client, err := vmm.NewVMM(inst.SocketPath) if err != nil { @@ -239,7 +254,7 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { // Try graceful shutdown log.DebugContext(ctx, "sending shutdown command to VMM", "id", inst.Id) client.ShutdownVMMWithResponse(ctx) - + // Wait for process to exit if inst.CHPID != nil { if !WaitForProcessExit(*inst.CHPID, 2*time.Second) { @@ -248,7 +263,6 @@ func (m *manager) shutdownVMM(ctx context.Context, inst *Instance) error { log.DebugContext(ctx, "VMM shutdown gracefully", "id", inst.Id, "pid", *inst.CHPID) } } - + return nil } - diff --git a/lib/instances/volumes_test.go b/lib/instances/volumes_test.go index c8a36816..5e46d007 100644 --- a/lib/instances/volumes_test.go +++ b/lib/instances/volumes_test.go @@ -67,7 +67,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...")
@@ -92,7 +92,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { t.Log("System files ready") // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume...") vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "shared-data", @@ -250,7 +250,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -272,7 +272,7 @@ func TestOverlayDiskCleanupOnDelete(t *testing.T) { require.NoError(t, err) // Create volume - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ Name: "cleanup-test-vol", SizeGb: 1, @@ -359,7 +359,7 @@ func TestVolumeFromArchive(t *testing.T) { p := paths.New(tmpDir) // Setup: prepare image and system files - imageManager, err := images.NewManager(p, 1) + imageManager, err := images.NewManager(p, 1, nil) require.NoError(t, err) t.Log("Pulling alpine image...") @@ -393,7 +393,7 @@ func TestVolumeFromArchive(t *testing.T) { archive := createTestTarGz(t, testFiles) // Create volume from archive - volumeManager := volumes.NewManager(p, 0) + volumeManager := volumes.NewManager(p, 0, nil) t.Log("Creating volume from archive...") vol, err := volumeManager.CreateVolumeFromArchive(ctx, volumes.CreateVolumeFromArchiveRequest{ Name: "archive-data", diff --git a/lib/logger/README.md b/lib/logger/README.md new file mode 100644 index 00000000..4996bfeb --- /dev/null +++ b/lib/logger/README.md @@ -0,0 +1,55 @@ +# Logger + +Structured logging with per-subsystem log levels and OpenTelemetry trace context integration. 
+ +## Features + +- Per-subsystem configurable log levels +- Automatic trace_id/span_id injection when OTel is active +- Context-based logger propagation +- JSON output format + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `LOG_LEVEL` | Default log level | `info` | +| `LOG_LEVEL_<SUBSYSTEM>` | Per-subsystem level | inherits default | + +Subsystems: `API`, `IMAGES`, `INSTANCES`, `NETWORK`, `VOLUMES`, `VMM`, `SYSTEM`, `EXEC` + +Example: +```bash +LOG_LEVEL=info LOG_LEVEL_NETWORK=debug ./hypeman +``` + +## Usage + +```go +// Create subsystem-specific logger +cfg := logger.NewConfig() +log := logger.NewSubsystemLogger(logger.SubsystemInstances, cfg) + +// Add logger to context +ctx = logger.AddToContext(ctx, log) + +// Retrieve from context +log = logger.FromContext(ctx) +log.InfoContext(ctx, "instance created", "id", instanceID) +``` + +## Output + +When OTel tracing is active, logs include trace context: + +```json +{ + "level": "INFO", + "msg": "instance created", + "subsystem": "INSTANCES", + "trace_id": "abc123...", + "span_id": "def456...", + "id": "instance-123" +} +``` + diff --git a/lib/logger/logger.go b/lib/logger/logger.go index b4ac1528..23a69ef3 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -1,20 +1,165 @@ +// Package logger provides structured logging with subsystem-specific levels +// and OpenTelemetry trace context integration. package logger import ( "context" "log/slog" + "os" + "strings" + + "go.opentelemetry.io/otel/trace" ) type contextKey string const loggerKey contextKey = "logger" -// AddToContext adds a logger to the context +// Subsystem names for per-subsystem logging configuration. +const ( + SubsystemAPI = "API" + SubsystemImages = "IMAGES" + SubsystemInstances = "INSTANCES" + SubsystemNetwork = "NETWORK" + SubsystemVolumes = "VOLUMES" + SubsystemVMM = "VMM" + SubsystemSystem = "SYSTEM" + SubsystemExec = "EXEC" +) + +// Config holds logging configuration. 
+type Config struct { + // DefaultLevel is the default log level for all subsystems. + DefaultLevel slog.Level + // SubsystemLevels maps subsystem names to their specific log levels. + // If a subsystem is not in this map, DefaultLevel is used. + SubsystemLevels map[string]slog.Level + // AddSource adds source file information to log entries. + AddSource bool +} + +// NewConfig creates a Config from environment variables. +// Reads LOG_LEVEL for default level and LOG_LEVEL_<SUBSYSTEM> for per-subsystem levels. +func NewConfig() Config { + cfg := Config{ + DefaultLevel: slog.LevelInfo, + SubsystemLevels: make(map[string]slog.Level), + AddSource: false, + } + + // Parse default level + if levelStr := os.Getenv("LOG_LEVEL"); levelStr != "" { + cfg.DefaultLevel = parseLevel(levelStr) + } + + // Parse subsystem-specific levels + subsystems := []string{ + SubsystemAPI, SubsystemImages, SubsystemInstances, + SubsystemNetwork, SubsystemVolumes, SubsystemVMM, + SubsystemSystem, SubsystemExec, + } + for _, subsystem := range subsystems { + envKey := "LOG_LEVEL_" + subsystem + if levelStr := os.Getenv(envKey); levelStr != "" { + cfg.SubsystemLevels[subsystem] = parseLevel(levelStr) + } + } + + return cfg +} + +// parseLevel parses a log level string. +func parseLevel(s string) slog.Level { + switch strings.ToLower(s) { + case "debug": + return slog.LevelDebug + case "info": + return slog.LevelInfo + case "warn", "warning": + return slog.LevelWarn + case "error": + return slog.LevelError + default: + return slog.LevelInfo + } +} + +// LevelFor returns the log level for the given subsystem. +func (c Config) LevelFor(subsystem string) slog.Level { + if level, ok := c.SubsystemLevels[subsystem]; ok { + return level + } + return c.DefaultLevel +} + +// NewLogger creates a new slog.Logger with JSON output. 
+func NewLogger(cfg Config) *slog.Logger { + return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: cfg.DefaultLevel, + AddSource: cfg.AddSource, + })) +} + +// NewSubsystemLogger creates a logger for a specific subsystem with its configured level. +func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { + level := cfg.LevelFor(subsystem) + handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: level, + AddSource: cfg.AddSource, + }) + // Wrap with trace context handler if we want trace IDs in logs + wrappedHandler := &traceContextHandler{ + Handler: handler, + subsystem: subsystem, + } + return slog.New(wrappedHandler) +} + +// traceContextHandler wraps a slog.Handler to add trace context and subsystem. +type traceContextHandler struct { + slog.Handler + subsystem string +} + +// Handle adds trace_id and span_id from the context if available. +func (h *traceContextHandler) Handle(ctx context.Context, r slog.Record) error { + // Add subsystem attribute + r.AddAttrs(slog.String("subsystem", h.subsystem)) + + // Add trace context if available + spanCtx := trace.SpanContextFromContext(ctx) + if spanCtx.IsValid() { + r.AddAttrs( + slog.String("trace_id", spanCtx.TraceID().String()), + slog.String("span_id", spanCtx.SpanID().String()), + ) + } + + return h.Handler.Handle(ctx, r) +} + +// WithAttrs returns a new handler with the given attributes. +func (h *traceContextHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithAttrs(attrs), + subsystem: h.subsystem, + } +} + +// WithGroup returns a new handler with the given group name. +func (h *traceContextHandler) WithGroup(name string) slog.Handler { + return &traceContextHandler{ + Handler: h.Handler.WithGroup(name), + subsystem: h.subsystem, + } +} + +// AddToContext adds a logger to the context. 
func AddToContext(ctx context.Context, logger *slog.Logger) context.Context { return context.WithValue(ctx, loggerKey, logger) } -// FromContext retrieves the logger from context, or returns default +// FromContext retrieves the logger from context, or returns default. func FromContext(ctx context.Context) *slog.Logger { if logger, ok := ctx.Value(loggerKey).(*slog.Logger); ok { return logger @@ -22,3 +167,7 @@ func FromContext(ctx context.Context) *slog.Logger { return slog.Default() } +// With returns a logger with additional attributes. +func With(logger *slog.Logger, args ...any) *slog.Logger { + return logger.With(args...) +} diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go new file mode 100644 index 00000000..823aef08 --- /dev/null +++ b/lib/middleware/otel.go @@ -0,0 +1,99 @@ +package middleware + +import ( + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// HTTPMetrics holds the OTel metrics for HTTP requests. +type HTTPMetrics struct { + requestsTotal metric.Int64Counter + requestDuration metric.Float64Histogram +} + +// NewHTTPMetrics creates new HTTP metrics instruments. +func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { + requestsTotal, err := meter.Int64Counter( + "hypeman_http_requests_total", + metric.WithDescription("Total number of HTTP requests"), + ) + if err != nil { + return nil, err + } + + requestDuration, err := meter.Float64Histogram( + "hypeman_http_request_duration_seconds", + metric.WithDescription("HTTP request duration in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &HTTPMetrics{ + requestsTotal: requestsTotal, + requestDuration: requestDuration, + }, nil +} + +// Middleware returns an HTTP middleware that records metrics. 
+func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code + wrapped := &responseWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } + + // Process request + next.ServeHTTP(wrapped, r) + + // Calculate duration + duration := time.Since(start).Seconds() + + // Get route pattern if available (chi specific) + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Record metrics + attrs := []attribute.KeyValue{ + attribute.String("method", r.Method), + attribute.String("path", routePattern), + attribute.Int("status", wrapped.statusCode), + } + + m.requestsTotal.Add(r.Context(), 1, metric.WithAttributes(attrs...)) + m.requestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...)) + }) +} + +// responseWriter wraps http.ResponseWriter to capture the status code. +type responseWriter struct { + http.ResponseWriter + statusCode int +} + +func (rw *responseWriter) WriteHeader(code int) { + rw.statusCode = code + rw.ResponseWriter.WriteHeader(code) +} + +// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. +func (rw *responseWriter) Unwrap() http.ResponseWriter { + return rw.ResponseWriter +} + +// NoopHTTPMetrics returns a middleware that does nothing (for when OTel is disabled). 
+func NoopHTTPMetrics() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return next + } +} diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 5396627d..b3f35900 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -59,6 +59,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N if err := m.createTAPDevice(tap, network.Bridge, network.Isolated); err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "allocated network", "instance_id", req.InstanceID, @@ -111,6 +112,7 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string) err if err := m.createTAPDevice(alloc.TAPDevice, network.Bridge, network.Isolated); err != nil { return fmt.Errorf("create TAP device: %w", err) } + m.recordTAPOperation(ctx, "create") log.InfoContext(ctx, "recreated network for restore", "instance_id", instanceID, @@ -138,6 +140,8 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro // 1. 
Delete TAP device (best effort) if err := m.deleteTAPDevice(alloc.TAPDevice); err != nil { log.WarnContext(ctx, "failed to delete TAP device", "tap", alloc.TAPDevice, "error", err) + } else { + m.recordTAPOperation(ctx, "delete") } log.InfoContext(ctx, "released network", @@ -270,4 +274,3 @@ func generateTAPName(instanceID string) string { } return TAPPrefix + strings.ToLower(shortID) } - diff --git a/lib/network/manager.go b/lib/network/manager.go index cb43af07..51e4b13d 100644 --- a/lib/network/manager.go +++ b/lib/network/manager.go @@ -9,6 +9,8 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // Manager defines the interface for network management @@ -27,19 +29,84 @@ type Manager interface { NameExists(ctx context.Context, name string) (bool, error) } +// Metrics holds the metrics instruments for network operations. +type Metrics struct { + tapOperations metric.Int64Counter +} + // manager implements the Manager interface type manager struct { - paths *paths.Paths - config *config.Config - mu sync.Mutex // Protects network allocation operations (IP allocation) + paths *paths.Paths + config *config.Config + mu sync.Mutex // Protects network allocation operations (IP allocation) + metrics *Metrics } -// NewManager creates a new network manager -func NewManager(p *paths.Paths, cfg *config.Config) Manager { - return &manager{ +// NewManager creates a new network manager. +// If meter is nil, metrics are disabled. +func NewManager(p *paths.Paths, cfg *config.Config, meter metric.Meter) Manager { + m := &manager{ paths: p, config: cfg, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newNetworkMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newNetworkMetrics creates and registers all network metrics. 
+func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for allocations + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + allocs, err := m.ListAllocations(ctx) + if err != nil { + return nil + } + o.ObserveInt64(allocationsTotal, int64(len(allocs))) + return nil + }, + allocationsTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + tapOperations: tapOperations, + }, nil +} + +// recordTAPOperation records a TAP device operation. +func (m *manager) recordTAPOperation(ctx context.Context, operation string) { + if m.metrics == nil { + return + } + m.metrics.tapOperations.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) } // Initialize initializes the network manager and creates default network. @@ -100,4 +167,3 @@ func (m *manager) getDefaultNetwork(ctx context.Context) (*Network, error) { CreatedAt: time.Time{}, // Unknown for default }, nil } - diff --git a/lib/otel/README.md b/lib/otel/README.md new file mode 100644 index 00000000..0f0187a7 --- /dev/null +++ b/lib/otel/README.md @@ -0,0 +1,92 @@ +# OpenTelemetry + +Provides OpenTelemetry initialization and metric definitions for Hypeman. 
+ +## Features + +- OTLP export for traces and metrics (gRPC) +- Runtime metrics (Go GC, goroutines, memory) +- Application-specific metrics per subsystem +- Graceful degradation (failures don't crash the app) + +## Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_ENABLED` | Enable OpenTelemetry | `false` | +| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | +| `OTEL_SERVICE_NAME` | Service name | `hypeman` | +| `OTEL_INSECURE` | Disable TLS for OTLP | `true` | + +## Metrics + +### System +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_uptime_seconds` | gauge | Process uptime | +| `hypeman_info` | gauge | Build info (version, go_version labels) | + +### HTTP +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_http_requests_total` | counter | method, path, status | Total HTTP requests | +| `hypeman_http_request_duration_seconds` | histogram | method, path, status | Request latency | + +### Images +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_images_build_queue_length` | gauge | | Current build queue size | +| `hypeman_images_build_duration_seconds` | histogram | status | Image build time | +| `hypeman_images_total` | gauge | status | Cached images count | +| `hypeman_images_pulls_total` | counter | status | Registry pulls | + +### Instances +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_instances_total` | gauge | state | Instances by state | +| `hypeman_instances_create_duration_seconds` | histogram | status | Create time | +| `hypeman_instances_restore_duration_seconds` | histogram | status | Restore time | +| `hypeman_instances_standby_duration_seconds` | histogram | status | Standby time | +| `hypeman_instances_state_transitions_total` | counter | from, to | State transitions | + +### Network +| Metric | Type | Labels | Description 
| +|--------|------|--------|-------------| +| `hypeman_network_allocations_total` | gauge | | Active IP allocations | +| `hypeman_network_tap_operations_total` | counter | operation | TAP create/delete ops | + +### Volumes +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_volumes_total` | gauge | Volume count | +| `hypeman_volumes_allocated_bytes` | gauge | Total provisioned size | +| `hypeman_volumes_used_bytes` | gauge | Actual disk space consumed | +| `hypeman_volumes_create_duration_seconds` | histogram | Creation time | + +### VMM +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_vmm_api_duration_seconds` | histogram | operation, status | CH API latency | +| `hypeman_vmm_api_errors_total` | counter | operation | CH API errors | + +### Exec +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `hypeman_exec_sessions_total` | counter | status, exit_code | Exec sessions | +| `hypeman_exec_duration_seconds` | histogram | status | Command duration | +| `hypeman_exec_bytes_sent_total` | counter | | Bytes to guest (stdin) | +| `hypeman_exec_bytes_received_total` | counter | | Bytes from guest (stdout+stderr) | + +## Usage + +```go +provider, shutdown, err := otel.Init(ctx, otel.Config{ + Enabled: true, + Endpoint: "localhost:4317", + ServiceName: "hypeman", +}) +defer shutdown(ctx) + +meter := provider.Meter // Use for creating metrics +``` + diff --git a/lib/otel/metrics.go b/lib/otel/metrics.go new file mode 100644 index 00000000..6532703d --- /dev/null +++ b/lib/otel/metrics.go @@ -0,0 +1,317 @@ +package otel + +import ( + "go.opentelemetry.io/otel/metric" +) + +// ImageMetrics holds metrics for the image manager. 
+type ImageMetrics struct { + BuildQueueLength metric.Int64ObservableGauge + BuildDuration metric.Float64Histogram + ImagesTotal metric.Int64ObservableGauge + PullsTotal metric.Int64Counter +} + +// NewImageMetrics creates metrics for the image manager. +func NewImageMetrics(meter metric.Meter) (*ImageMetrics, error) { + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + return &ImageMetrics{ + BuildQueueLength: buildQueueLength, + BuildDuration: buildDuration, + ImagesTotal: imagesTotal, + PullsTotal: pullsTotal, + }, nil +} + +// InstanceMetrics holds metrics for the instance manager. +type InstanceMetrics struct { + InstancesTotal metric.Int64ObservableGauge + CreateDuration metric.Float64Histogram + RestoreDuration metric.Float64Histogram + StandbyDuration metric.Float64Histogram + StateTransitions metric.Int64Counter +} + +// NewInstanceMetrics creates metrics for the instance manager. 
+func NewInstanceMetrics(meter metric.Meter) (*InstanceMetrics, error) { + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + return &InstanceMetrics{ + InstancesTotal: instancesTotal, + CreateDuration: createDuration, + RestoreDuration: restoreDuration, + StandbyDuration: standbyDuration, + StateTransitions: stateTransitions, + }, nil +} + +// NetworkMetrics holds metrics for the network manager. +type NetworkMetrics struct { + AllocationsTotal metric.Int64ObservableGauge + TapOperations metric.Int64Counter +} + +// NewNetworkMetrics creates metrics for the network manager. 
+func NewNetworkMetrics(meter metric.Meter) (*NetworkMetrics, error) { + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + return &NetworkMetrics{ + AllocationsTotal: allocationsTotal, + TapOperations: tapOperations, + }, nil +} + +// VolumeMetrics holds metrics for the volume manager. +type VolumeMetrics struct { + VolumesTotal metric.Int64ObservableGauge + AllocatedBytes metric.Int64ObservableGauge + UsedBytes metric.Int64ObservableGauge + CreateDuration metric.Float64Histogram +} + +// NewVolumeMetrics creates metrics for the volume manager. +func NewVolumeMetrics(meter metric.Meter) (*VolumeMetrics, error) { + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &VolumeMetrics{ + VolumesTotal: volumesTotal, + AllocatedBytes: allocatedBytes, + UsedBytes: usedBytes, + CreateDuration: createDuration, + }, nil +} 
+ +// ExecMetrics holds metrics for the exec client. +type ExecMetrics struct { + SessionsTotal metric.Int64Counter + Duration metric.Float64Histogram + BytesSentTotal metric.Int64Counter + BytesReceivedTotal metric.Int64Counter +} + +// NewExecMetrics creates metrics for the exec client. +func NewExecMetrics(meter metric.Meter) (*ExecMetrics, error) { + sessionsTotal, err := meter.Int64Counter( + "hypeman_exec_sessions_total", + metric.WithDescription("Total number of exec sessions"), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "hypeman_exec_duration_seconds", + metric.WithDescription("Exec command duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + bytesSentTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_sent_total", + metric.WithDescription("Total bytes sent to guest (stdin)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + bytesReceivedTotal, err := meter.Int64Counter( + "hypeman_exec_bytes_received_total", + metric.WithDescription("Total bytes received from guest (stdout+stderr)"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + return &ExecMetrics{ + SessionsTotal: sessionsTotal, + Duration: duration, + BytesSentTotal: bytesSentTotal, + BytesReceivedTotal: bytesReceivedTotal, + }, nil +} + +// VMMMetrics holds metrics for the VMM client. +type VMMMetrics struct { + APIDuration metric.Float64Histogram + APIErrorsTotal metric.Int64Counter +} + +// NewVMMMetrics creates metrics for the VMM client. 
+func NewVMMMetrics(meter metric.Meter) (*VMMMetrics, error) { + apiDuration, err := meter.Float64Histogram( + "hypeman_vmm_api_duration_seconds", + metric.WithDescription("Cloud Hypervisor API call duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + apiErrorsTotal, err := meter.Int64Counter( + "hypeman_vmm_api_errors_total", + metric.WithDescription("Total number of Cloud Hypervisor API errors"), + ) + if err != nil { + return nil, err + } + + return &VMMMetrics{ + APIDuration: apiDuration, + APIErrorsTotal: apiErrorsTotal, + }, nil +} + +// HTTPMetrics holds metrics for HTTP middleware. +type HTTPMetrics struct { + RequestsTotal metric.Int64Counter + RequestDuration metric.Float64Histogram +} + +// NewHTTPMetrics creates metrics for HTTP middleware. +func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { + requestsTotal, err := meter.Int64Counter( + "hypeman_http_requests_total", + metric.WithDescription("Total number of HTTP requests"), + ) + if err != nil { + return nil, err + } + + requestDuration, err := meter.Float64Histogram( + "hypeman_http_request_duration_seconds", + metric.WithDescription("HTTP request duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &HTTPMetrics{ + RequestsTotal: requestsTotal, + RequestDuration: requestDuration, + }, nil +} diff --git a/lib/otel/otel.go b/lib/otel/otel.go new file mode 100644 index 00000000..7c3e8f20 --- /dev/null +++ b/lib/otel/otel.go @@ -0,0 +1,214 @@ +// Package otel provides OpenTelemetry initialization and configuration. 
+package otel + +import ( + "context" + "fmt" + goruntime "runtime" + "time" + + otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.24.0" + "go.opentelemetry.io/otel/trace" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// Config holds OpenTelemetry configuration. +type Config struct { + Enabled bool + Endpoint string + ServiceName string + Insecure bool + Version string +} + +// Provider holds initialized OTel providers. +type Provider struct { + TracerProvider *sdktrace.TracerProvider + MeterProvider *sdkmetric.MeterProvider + Tracer trace.Tracer + Meter metric.Meter + startTime time.Time +} + +// Init initializes OpenTelemetry with the given configuration. +// Returns a shutdown function that should be called on application exit. +// If OTel is disabled, returns a no-op shutdown function. 
+func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) error, error) { + if !cfg.Enabled { + // Return no-op provider when disabled + return &Provider{ + Tracer: otel.Tracer(cfg.ServiceName), + Meter: otel.Meter(cfg.ServiceName), + startTime: time.Now(), + }, func(context.Context) error { return nil }, nil + } + + // Create resource with service information + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.Version), + ), + ) + if err != nil { + return nil, nil, fmt.Errorf("create resource: %w", err) + } + + // Setup gRPC connection options + var dialOpts []grpc.DialOption + if cfg.Insecure { + dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials())) + } + + // Create trace exporter + traceExporter, err := otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(cfg.Endpoint), + otlptracegrpc.WithDialOption(dialOpts...), + ) + if err != nil { + return nil, nil, fmt.Errorf("create trace exporter: %w", err) + } + + // Create tracer provider + tracerProvider := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(traceExporter), + sdktrace.WithResource(res), + ) + + // Create metric exporter + metricExporter, err := otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(cfg.Endpoint), + otlpmetricgrpc.WithDialOption(dialOpts...), + ) + if err != nil { + tracerProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create metric exporter: %w", err) + } + + // Create meter provider + meterProvider := sdkmetric.NewMeterProvider( + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExporter)), + sdkmetric.WithResource(res), + ) + + // Set global providers + otel.SetTracerProvider(tracerProvider) + otel.SetMeterProvider(meterProvider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + // Start runtime metrics collection + if 
err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("start runtime metrics: %w", err) + } + + provider := &Provider{ + TracerProvider: tracerProvider, + MeterProvider: meterProvider, + Tracer: tracerProvider.Tracer(cfg.ServiceName), + Meter: meterProvider.Meter(cfg.ServiceName), + startTime: time.Now(), + } + + // Register system metrics (uptime, info) + if err := provider.registerSystemMetrics(cfg); err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("register system metrics: %w", err) + } + + shutdown := func(ctx context.Context) error { + var errs []error + if err := tracerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown tracer: %w", err)) + } + if err := meterProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) + } + if len(errs) > 0 { + return fmt.Errorf("shutdown errors: %v", errs) + } + return nil + } + + return provider, shutdown, nil +} + +// registerSystemMetrics registers uptime and info metrics. 
+func (p *Provider) registerSystemMetrics(cfg Config) error { + // Uptime gauge + uptime, err := p.Meter.Float64ObservableGauge( + "hypeman_uptime_seconds", + metric.WithDescription("Process uptime in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return fmt.Errorf("create uptime gauge: %w", err) + } + + // Info gauge (always 1, with version labels) + info, err := p.Meter.Int64ObservableGauge( + "hypeman_info", + metric.WithDescription("Hypeman build information"), + ) + if err != nil { + return fmt.Errorf("create info gauge: %w", err) + } + + _, err = p.Meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + o.ObserveFloat64(uptime, time.Since(p.startTime).Seconds()) + o.ObserveInt64(info, 1, + metric.WithAttributes( + semconv.ServiceVersion(cfg.Version), + semconv.TelemetrySDKLanguageGo, + ), + ) + return nil + }, + uptime, + info, + ) + if err != nil { + return fmt.Errorf("register callback: %w", err) + } + + return nil +} + +// TracerFor returns a tracer for the given subsystem. +func (p *Provider) TracerFor(subsystem string) trace.Tracer { + if p.TracerProvider != nil { + return p.TracerProvider.Tracer(subsystem) + } + return otel.Tracer(subsystem) +} + +// MeterFor returns a meter for the given subsystem. +func (p *Provider) MeterFor(subsystem string) metric.Meter { + if p.MeterProvider != nil { + return p.MeterProvider.Meter(subsystem) + } + return otel.Meter(subsystem) +} + +// GoVersion returns the Go version used to build the binary. 
+func GoVersion() string { + return goruntime.Version() +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 2daa0749..fa9d0b7c 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log/slog" - "os" "github.com/c2h5oh/datasize" "github.com/onkernel/hypeman/cmd/api/config" @@ -17,11 +16,10 @@ import ( "github.com/onkernel/hypeman/lib/volumes" ) -// ProvideLogger provides a structured logger +// ProvideLogger provides a structured logger with subsystem-specific levels func ProvideLogger() *slog.Logger { - return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ - Level: slog.LevelInfo, - })) + cfg := logger.NewConfig() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg) } // ProvideContext provides a context with logger attached @@ -41,7 +39,7 @@ func ProvidePaths(cfg *config.Config) *paths.Paths { // ProvideImageManager provides the image manager func ProvideImageManager(p *paths.Paths, cfg *config.Config) (images.Manager, error) { - return images.NewManager(p, cfg.MaxConcurrentBuilds) + return images.NewManager(p, cfg.MaxConcurrentBuilds, nil) } // ProvideSystemManager provides the system manager @@ -51,7 +49,7 @@ func ProvideSystemManager(p *paths.Paths) system.Manager { // ProvideNetworkManager provides the network manager func ProvideNetworkManager(p *paths.Paths, cfg *config.Config) network.Manager { - return network.NewManager(p, cfg) + return network.NewManager(p, cfg, nil) } // ProvideInstanceManager provides the instance manager @@ -90,7 +88,7 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima MaxTotalMemory: maxTotalMemory, } - return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits), nil + return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil), nil } // ProvideVolumeManager provides the volume manager @@ -105,5 +103,5 @@ 
func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, maxTotalVolumeStorage = int64(storageSize) } - return volumes.NewManager(p, maxTotalVolumeStorage), nil + return volumes.NewManager(p, maxTotalVolumeStorage, nil), nil } diff --git a/lib/vmm/metrics.go b/lib/vmm/metrics.go new file mode 100644 index 00000000..39ef4dd1 --- /dev/null +++ b/lib/vmm/metrics.go @@ -0,0 +1,66 @@ +package vmm + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for VMM operations. +type Metrics struct { + APIDuration metric.Float64Histogram + APIErrorsTotal metric.Int64Counter +} + +// NewMetrics creates VMM metrics instruments. +// If meter is nil, returns nil (metrics disabled). +func NewMetrics(meter metric.Meter) (*Metrics, error) { + if meter == nil { + return nil, nil + } + + apiDuration, err := meter.Float64Histogram( + "hypeman_vmm_api_duration_seconds", + metric.WithDescription("Cloud Hypervisor API call duration"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + apiErrorsTotal, err := meter.Int64Counter( + "hypeman_vmm_api_errors_total", + metric.WithDescription("Total number of Cloud Hypervisor API errors"), + ) + if err != nil { + return nil, err + } + + return &Metrics{ + APIDuration: apiDuration, + APIErrorsTotal: apiErrorsTotal, + }, nil +} + +// RecordAPICall records the duration and status of an API call. 
+func (m *Metrics) RecordAPICall(ctx context.Context, operation string, start time.Time, err error) { + if m == nil { + return + } + + duration := time.Since(start).Seconds() + status := "success" + if err != nil { + status = "error" + m.APIErrorsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + + m.APIDuration.Record(ctx, duration, + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) +} diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index 2292e98c..ac5b375a 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -6,11 +6,14 @@ import ( "io" "os" "sync" + "syscall" "time" "github.com/nrednav/cuid2" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // Manager provides volume lifecycle operations @@ -34,20 +37,126 @@ type Manager interface { GetVolumePath(id string) string } +// Metrics holds the metrics instruments for volume operations. +type Metrics struct { + createDuration metric.Float64Histogram +} + type manager struct { paths *paths.Paths maxTotalVolumeStorage int64 // Maximum total volume storage in bytes (0 = unlimited) volumeLocks sync.Map // map[string]*sync.RWMutex - per-volume locks + metrics *Metrics } -// NewManager creates a new volumes manager -// maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited) -func NewManager(p *paths.Paths, maxTotalVolumeStorage int64) Manager { - return &manager{ +// NewManager creates a new volumes manager. +// maxTotalVolumeStorage is the maximum total volume storage in bytes (0 = unlimited). +// If meter is nil, metrics are disabled. 
+func NewManager(p *paths.Paths, maxTotalVolumeStorage int64, meter metric.Meter) Manager { + m := &manager{ paths: p, maxTotalVolumeStorage: maxTotalVolumeStorage, volumeLocks: sync.Map{}, } + + // Initialize metrics if meter is provided + if meter != nil { + metrics, err := newVolumeMetrics(meter, m) + if err == nil { + m.metrics = metrics + } + } + + return m +} + +// newVolumeMetrics creates and registers all volume metrics. +func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + volumes, err := m.ListVolumes(ctx) + if err != nil { + return nil + } + + o.ObserveInt64(volumesTotal, int64(len(volumes))) + + var totalAllocated, totalUsed int64 + for _, vol := range volumes { + // Allocated = provisioned size in GB + totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 + + // Used = actual disk blocks consumed (for sparse files) + diskPath := m.paths.VolumeData(vol.Id) + if stat, err := os.Stat(diskPath); err == nil { + // Get actual blocks used via syscall + if sysStat, ok := 
stat.Sys().(*syscall.Stat_t); ok { + totalUsed += sysStat.Blocks * 512 // Blocks are in 512-byte units + } + } + } + + o.ObserveInt64(allocatedBytes, totalAllocated) + o.ObserveInt64(usedBytes, totalUsed) + return nil + }, + volumesTotal, + allocatedBytes, + usedBytes, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + }, nil +} + +// recordCreateDuration records the volume creation duration. +func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.createDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) } // getVolumeLock returns or creates a lock for a specific volume @@ -92,6 +201,8 @@ func (m *manager) calculateTotalVolumeStorage(ctx context.Context) (int64, error // CreateVolume creates a new volume func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -145,12 +256,15 @@ func (m *manager) CreateVolume(ctx context.Context, req CreateVolumeRequest) (*V return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } // CreateVolumeFromArchive creates a new volume pre-populated with content from a tar.gz archive. // The archive is safely extracted with size limits to prevent tar bombs. 
func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeFromArchiveRequest, archive io.Reader) (*Volume, error) { + start := time.Now() + // Generate or use provided ID id := cuid2.Generate() if req.Id != nil && *req.Id != "" { @@ -223,6 +337,7 @@ func (m *manager) CreateVolumeFromArchive(ctx context.Context, req CreateVolumeF return nil, err } + m.recordCreateDuration(ctx, start, "success") return m.metadataToVolume(meta), nil } diff --git a/lib/volumes/manager_test.go b/lib/volumes/manager_test.go index 3752e230..130fb1c4 100644 --- a/lib/volumes/manager_test.go +++ b/lib/volumes/manager_test.go @@ -23,7 +23,7 @@ func setupTestManager(t *testing.T) (Manager, *paths.Paths, func()) { // Create required directories require.NoError(t, os.MkdirAll(p.VolumesDir(), 0755)) - manager := NewManager(p, 0) // 0 = unlimited storage + manager := NewManager(p, 0, nil) // 0 = unlimited storage cleanup := func() { os.RemoveAll(tmpDir) @@ -389,4 +389,3 @@ func TestMultiAttach_ConcurrentRWConflict(t *testing.T) { assert.Len(t, vol.Attachments, 1, "Should have exactly one attachment") assert.False(t, vol.Attachments[0].Readonly, "Attachment should be read-write") } - From 0ea3118e7ac4b08982910541cad68ecd7e3cd98f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:16:40 -0500 Subject: [PATCH 02/14] Update top level README --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index dc7a3577..1bf3ff83 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,10 @@ Hypeman can be configured using the following environment variables: | `DNS_SERVER` | DNS server IP address for VMs | `1.1.1.1` | | `MAX_CONCURRENT_BUILDS` | Maximum number of concurrent image builds | `1` | | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | +| `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | +| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `LOG_LEVEL` | Default log 
level (debug, info, warn, error) | `info` | +| `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | **Important: Subnet Configuration** @@ -159,6 +163,24 @@ make dev ``` The server will start on port 8080 (configurable via `PORT` environment variable). +#### Local OpenTelemetry (optional) + +To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir): + +```bash +# Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin) +docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ + grafana/otel-lgtm:latest + +# Enable OTel in .env +echo "OTEL_ENABLED=true" >> .env + +# Restart dev server +make dev +``` + +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. + ### Testing Network tests require elevated permissions to create bridges and TAP devices. From 6d3f4cb8c62178a9bddf2798e83459206c2885ac Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 13:24:37 -0500 Subject: [PATCH 03/14] tweak global labels --- README.md | 7 +++-- cmd/api/config/config.go | 65 +++++++++++++++++++++++++++++++++------- cmd/api/main.go | 12 ++++---- lib/otel/README.md | 2 ++ lib/otel/otel.go | 14 +++++---- 5 files changed, 78 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 1bf3ff83..81f4a52b 100644 --- a/README.md +++ b/README.md @@ -75,8 +75,10 @@ Hypeman can be configured using the following environment variables: | `DNS_SERVER` | DNS server IP address for VMs | `1.1.1.1` | | `MAX_CONCURRENT_BUILDS` | Maximum number of concurrent image builds | `1` | | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | +| `ENV` | Deployment environment (filters telemetry, e.g. 
your name for dev) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | | `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID for telemetry (differentiates multiple servers) | hostname | | `LOG_LEVEL` | Default log level (debug, info, warn, error) | `info` | | `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | @@ -172,14 +174,15 @@ To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ grafana/otel-lgtm:latest -# Enable OTel in .env +# Enable OTel in .env (set ENV to your name to filter your telemetry) echo "OTEL_ENABLED=true" >> .env +echo "ENV=yourname" >> .env # Restart dev server make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. Filter by `deployment.environment="yourname"` to see only your data. ### Testing diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index ec810d7e..d04a627e 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -2,11 +2,52 @@ package config import ( "os" + "runtime/debug" "strconv" "github.com/joho/godotenv" ) +func getHostname() string { + if h, err := os.Hostname(); err == nil { + return h + } + return "unknown" +} + +// getBuildVersion extracts version info from Go's embedded build info. +// Returns git short hash + "-dirty" suffix if uncommitted changes, or "unknown" if unavailable. 
+func getBuildVersion() string { + info, ok := debug.ReadBuildInfo() + if !ok { + return "unknown" + } + + var revision string + var dirty bool + for _, s := range info.Settings { + switch s.Key { + case "vcs.revision": + revision = s.Value + case "vcs.modified": + dirty = s.Value == "true" + } + } + + if revision == "" { + return "unknown" + } + + // Use short hash (8 chars) + if len(revision) > 8 { + revision = revision[:8] + } + if dirty { + revision += "-dirty" + } + return revision +} + type Config struct { Port string DataDir string @@ -32,11 +73,13 @@ type Config struct { MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) // OpenTelemetry configuration - OtelEnabled bool // Enable OpenTelemetry - OtelEndpoint string // OTLP endpoint (gRPC) - OtelServiceName string // Service name for tracing - OtelInsecure bool // Disable TLS for OTLP - Version string // Application version for telemetry + OtelEnabled bool // Enable OpenTelemetry + OtelEndpoint string // OTLP endpoint (gRPC) + OtelServiceName string // Service name for tracing + OtelServiceInstanceID string // Service instance ID (default: hostname) + OtelInsecure bool // Disable TLS for OTLP + Version string // Application version for telemetry + Env string // Deployment environment (e.g., dev, staging, prod) // Logging configuration LogLevel string // Default log level (debug, info, warn, error) @@ -73,11 +116,13 @@ func Load() *Config { MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), // OpenTelemetry configuration - OtelEnabled: getEnvBool("OTEL_ENABLED", false), - OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), - OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), - OtelInsecure: getEnvBool("OTEL_INSECURE", true), - Version: getEnv("VERSION", "dev"), + OtelEnabled: getEnvBool("OTEL_ENABLED", false), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), + OtelServiceInstanceID: 
getEnv("OTEL_SERVICE_INSTANCE_ID", getHostname()), + OtelInsecure: getEnvBool("OTEL_INSECURE", true), + Version: getEnv("VERSION", getBuildVersion()), + Env: getEnv("ENV", "unset"), // Logging configuration LogLevel: getEnv("LOG_LEVEL", "info"), diff --git a/cmd/api/main.go b/cmd/api/main.go index 4ff139b1..d5ae60e1 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -43,11 +43,13 @@ func run() error { // Initialize OpenTelemetry (before wire initialization) otelCfg := otel.Config{ - Enabled: cfg.OtelEnabled, - Endpoint: cfg.OtelEndpoint, - ServiceName: cfg.OtelServiceName, - Insecure: cfg.OtelInsecure, - Version: cfg.Version, + Enabled: cfg.OtelEnabled, + Endpoint: cfg.OtelEndpoint, + ServiceName: cfg.OtelServiceName, + ServiceInstanceID: cfg.OtelServiceInstanceID, + Insecure: cfg.OtelInsecure, + Version: cfg.Version, + Env: cfg.Env, } otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) diff --git a/lib/otel/README.md b/lib/otel/README.md index 0f0187a7..3c60c432 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -13,9 +13,11 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. | Variable | Description | Default | |----------|-------------|---------| +| `ENV` | Deployment environment (`deployment.environment` attribute) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry | `false` | | `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | | `OTEL_SERVICE_NAME` | Service name | `hypeman` | +| `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | | `OTEL_INSECURE` | Disable TLS for OTLP | `true` | ## Metrics diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 7c3e8f20..26ea1e4e 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -24,11 +24,13 @@ import ( // Config holds OpenTelemetry configuration. 
type Config struct { - Enabled bool - Endpoint string - ServiceName string - Insecure bool - Version string + Enabled bool + Endpoint string + ServiceName string + ServiceInstanceID string + Insecure bool + Version string + Env string } // Provider holds initialized OTel providers. @@ -60,6 +62,8 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err semconv.SchemaURL, semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.Version), + semconv.ServiceInstanceID(cfg.ServiceInstanceID), + semconv.DeploymentEnvironment(cfg.Env), ), ) if err != nil { From e4772926e6d22c660e5afbf8213dadc897e879b5 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:10:24 -0500 Subject: [PATCH 04/14] Fix otel connection --- README.md | 21 ++- cmd/api/config/config.go | 2 +- dashboards/hypeman.json | 395 +++++++++++++++++++++++++++++++++++++++ lib/otel/README.md | 2 +- lib/otel/otel.go | 4 +- 5 files changed, 417 insertions(+), 7 deletions(-) create mode 100644 dashboards/hypeman.json diff --git a/README.md b/README.md index 81f4a52b..ae9712eb 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Hypeman can be configured using the following environment variables: | `MAX_OVERLAY_SIZE` | Maximum size for overlay filesystem | `100GB` | | `ENV` | Deployment environment (filters telemetry, e.g. 
your name for dev) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry traces/metrics | `false` | -| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `localhost:4317` | +| `OTEL_ENDPOINT` | OTLP gRPC endpoint | `127.0.0.1:4317` | | `OTEL_SERVICE_INSTANCE_ID` | Instance ID for telemetry (differentiates multiple servers) | hostname | | `LOG_LEVEL` | Default log level (debug, info, warn, error) | `info` | | `LOG_LEVEL_` | Per-subsystem log level (API, IMAGES, INSTANCES, NETWORK, VOLUMES, VMM, SYSTEM, EXEC) | inherits default | @@ -171,9 +171,17 @@ To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana ```bash # Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin) -docker run -d --name lgtm -p 3000:3000 -p 4317:4317 -p 4318:4318 \ +# Note, if you are developing on a shared server, you can use the same LGTM stack as your peer(s) +# You will be able to sort your metrics, traces, and logs using the ENV configuration (see below) +docker run -d --name lgtm \ + -p 127.0.0.1:3000:3000 \ + -p 127.0.0.1:4317:4317 \ + -p 127.0.0.1:4318:4318 \ grafana/otel-lgtm:latest +# If developing on a remote server, forward the port to your local machine: +# ssh -L 3001:localhost:3000 your-server (then open http://localhost:3001) + # Enable OTel in .env (set ENV to your name to filter your telemetry) echo "OTEL_ENABLED=true" >> .env echo "ENV=yourname" >> .env @@ -182,7 +190,14 @@ echo "ENV=yourname" >> .env make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. Filter by `deployment.environment="yourname"` to see only your data. +Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. + +**Import the Hypeman dashboard:** +1. Go to Dashboards → New → Import +2. Upload `dashboards/hypeman.json` or paste its contents +3. Select the Prometheus datasource and click Import + +Use the Environment/Instance dropdowns to filter by `deployment.environment` or `service.instance.id`. 
### Testing diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index d04a627e..18db2007 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -117,7 +117,7 @@ func Load() *Config { // OpenTelemetry configuration OtelEnabled: getEnvBool("OTEL_ENABLED", false), - OtelEndpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"), + OtelEndpoint: getEnv("OTEL_ENDPOINT", "127.0.0.1:4317"), OtelServiceName: getEnv("OTEL_SERVICE_NAME", "hypeman"), OtelServiceInstanceID: getEnv("OTEL_SERVICE_INSTANCE_ID", getHostname()), OtelInsecure: getEnvBool("OTEL_INSECURE", true), diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json new file mode 100644 index 00000000..dc5fb605 --- /dev/null +++ b/dashboards/hypeman.json @@ -0,0 +1,395 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "gridPos": { "h": 3, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_uptime_seconds", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 3, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_instances_total", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "title": "Instances", + "type": "stat" + }, + { + "gridPos": { "h": 3, 
"w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_total", + "refId": "A" + } + ], + "title": "Volumes", + "type": "stat" + }, + { + "gridPos": { "h": 3, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_total", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Images", + "type": "stat" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_server_request_duration_seconds_sum[5m]) / rate(http_server_request_duration_seconds_count[5m])", + "legendFormat": "{{http_route}}", + "refId": "A" + } + ], + "title": "HTTP Request Latency (avg)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_server_request_duration_seconds_count[5m])", + "legendFormat": "{{http_route}} {{http_status_code}}", + "refId": "A" + } + ], + 
"title": "HTTP Requests/sec", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "reqps" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 11 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_create_duration_seconds_bucket[5m]))", + "legendFormat": "p95 create", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_restore_duration_seconds_bucket[5m]))", + "legendFormat": "p95 restore", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_instances_standby_duration_seconds_bucket[5m]))", + "legendFormat": "p95 standby", + "refId": "C" + } + ], + "title": "Instance Operation Latency (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 11 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_allocated_bytes", + "legendFormat": "allocated", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_volumes_used_bytes", + "legendFormat": "used", + "refId": "B" + } + ], + "title": "Volume Storage", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 11 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", 
"placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_bytes_sent_total[5m])", + "legendFormat": "sent (stdin)", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_bytes_received_total[5m])", + "legendFormat": "received (stdout+stderr)", + "refId": "B" + } + ], + "title": "Exec Throughput", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "Bps" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 19 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_images_build_queue_length", + "legendFormat": "queue length", + "refId": "A" + } + ], + "title": "Image Build Queue", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 19 }, + "id": 11, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(hypeman_vmm_api_duration_seconds_bucket[5m]))", + "legendFormat": "p95 {{operation}}", + "refId": "A" + } + ], + "title": "VMM API Latency (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 19 }, + "id": 12, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_network_allocations_total", + 
"legendFormat": "IP allocations", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_network_tap_operations_total[5m])", + "legendFormat": "TAP ops/sec ({{operation}})", + "refId": "B" + } + ], + "title": "Network", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, + "id": 13, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_memstats_heap_inuse_bytes", + "legendFormat": "heap in use", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_memstats_stack_inuse_bytes", + "legendFormat": "stack in use", + "refId": "B" + } + ], + "title": "Go Memory", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "bytes" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 27 }, + "id": 14, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_goroutines", + "legendFormat": "goroutines", + "refId": "A" + } + ], + "title": "Goroutines", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "tags": ["hypeman"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, deployment_environment)", + "hide": 0, + "includeAll": true, + "label": "Environment", + "multi": false, + "name": "env", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment)", "refId": "A" }, + "refresh": 1, + "regex": "", + 
"skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(hypeman_uptime_seconds, service_instance_id)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": { "query": "label_values(hypeman_uptime_seconds, service_instance_id)", "refId": "A" }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Hypeman", + "uid": "hypeman-overview", + "version": 1, + "weekStart": "" +} + diff --git a/lib/otel/README.md b/lib/otel/README.md index 3c60c432..ff181ef3 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -15,7 +15,7 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. |----------|-------------|---------| | `ENV` | Deployment environment (`deployment.environment` attribute) | `unset` | | `OTEL_ENABLED` | Enable OpenTelemetry | `false` | -| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `localhost:4317` | +| `OTEL_ENDPOINT` | OTLP endpoint (gRPC) | `127.0.0.1:4317` | | `OTEL_SERVICE_NAME` | Service name | `hypeman` | | `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | | `OTEL_INSECURE` | Disable TLS for OTLP | `true` | diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 26ea1e4e..21f47af8 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -16,7 +16,7 @@ import ( sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" - semconv "go.opentelemetry.io/otel/semconv/v1.24.0" + semconv "go.opentelemetry.io/otel/semconv/v1.37.0" "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -63,7 +63,7 @@ func 
Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.Version), semconv.ServiceInstanceID(cfg.ServiceInstanceID), - semconv.DeploymentEnvironment(cfg.Env), + semconv.DeploymentEnvironmentName(cfg.Env), ), ) if err != nil { From 3ce88c477cc2e97fc15e8d3c290bc697a815abba Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:12:38 -0500 Subject: [PATCH 05/14] Fix non-tls otel config --- lib/otel/otel.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 21f47af8..acac499a 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -18,8 +18,6 @@ import ( sdktrace "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.37.0" "go.opentelemetry.io/otel/trace" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" ) // Config holds OpenTelemetry configuration. @@ -70,17 +68,14 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err return nil, nil, fmt.Errorf("create resource: %w", err) } - // Setup gRPC connection options - var dialOpts []grpc.DialOption - if cfg.Insecure { - dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials())) - } - // Create trace exporter - traceExporter, err := otlptracegrpc.New(ctx, + traceOpts := []otlptracegrpc.Option{ otlptracegrpc.WithEndpoint(cfg.Endpoint), - otlptracegrpc.WithDialOption(dialOpts...), - ) + } + if cfg.Insecure { + traceOpts = append(traceOpts, otlptracegrpc.WithInsecure()) + } + traceExporter, err := otlptracegrpc.New(ctx, traceOpts...) 
if err != nil { return nil, nil, fmt.Errorf("create trace exporter: %w", err) } @@ -92,10 +87,13 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err ) // Create metric exporter - metricExporter, err := otlpmetricgrpc.New(ctx, + metricOpts := []otlpmetricgrpc.Option{ otlpmetricgrpc.WithEndpoint(cfg.Endpoint), - otlpmetricgrpc.WithDialOption(dialOpts...), - ) + } + if cfg.Insecure { + metricOpts = append(metricOpts, otlpmetricgrpc.WithInsecure()) + } + metricExporter, err := otlpmetricgrpc.New(ctx, metricOpts...) if err != nil { tracerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("create metric exporter: %w", err) From b461a031aaaea87b99dec86ed81f188b693a17b8 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 14:34:13 -0500 Subject: [PATCH 06/14] Fix dashboard --- README.md | 2 ++ dashboards/hypeman.json | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ae9712eb..ac2a048f 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,8 @@ docker run -d --name lgtm \ -p 127.0.0.1:3000:3000 \ -p 127.0.0.1:4317:4317 \ -p 127.0.0.1:4318:4318 \ + -p 127.0.0.1:9090:9090 \ + -p 127.0.0.1:4040:4040 \ grafana/otel-lgtm:latest # If developing on a remote server, forward the port to your local machine: diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index dc5fb605..31c2d3f0 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -108,8 +108,8 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(http_server_request_duration_seconds_sum[5m]) / rate(http_server_request_duration_seconds_count[5m])", - "legendFormat": "{{http_route}}", + "expr": "rate(hypeman_http_request_duration_seconds_sum[5m]) / rate(hypeman_http_request_duration_seconds_count[5m])", + "legendFormat": "{{http_request_method}} {{http_route}}", "refId": "A" } ], @@ -130,8 +130,8 @@ "targets": [ { "datasource": { "type": 
"prometheus", "uid": "prometheus" }, - "expr": "rate(http_server_request_duration_seconds_count[5m])", - "legendFormat": "{{http_route}} {{http_status_code}}", + "expr": "rate(hypeman_http_requests_total[5m])", + "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}", "refId": "A" } ], @@ -306,14 +306,14 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memstats_heap_inuse_bytes", - "legendFormat": "heap in use", + "expr": "go_memory_used_bytes", + "legendFormat": "memory used", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memstats_stack_inuse_bytes", - "legendFormat": "stack in use", + "expr": "go_memory_gc_goal_bytes", + "legendFormat": "GC goal", "refId": "B" } ], @@ -334,7 +334,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_goroutines", + "expr": "go_goroutine_count", "legendFormat": "goroutines", "refId": "A" } @@ -351,14 +351,14 @@ { "current": { "selected": false, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(hypeman_uptime_seconds, deployment_environment)", + "definition": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "hide": 0, "includeAll": true, "label": "Environment", "multi": false, "name": "env", "options": [], - "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment)", "refId": "A" }, + "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "refId": "A" }, "refresh": 1, "regex": "", "skipUrlSync": false, @@ -392,4 +392,3 @@ "version": 1, "weekStart": "" } - From d4dffc2c929adf2785629bf6cd05cfb6bed4e4c4 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:10:29 -0500 Subject: [PATCH 07/14] Fix logs --- cmd/api/main.go | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) 
diff --git a/cmd/api/main.go b/cmd/api/main.go index d5ae60e1..f8e91a37 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "os/signal" + "strings" "syscall" "time" @@ -130,12 +131,8 @@ func run() error { // Create router r := chi.NewRouter() - // Add OpenTelemetry tracing middleware if enabled - if cfg.OtelEnabled { - r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) - } - - // Add HTTP metrics middleware if OTel is enabled + // Prepare HTTP metrics middleware (applied inside API group, not globally) + // Global application breaks WebSocket (Hijacker) and SSE (Flusher) var httpMetricsMw func(http.Handler) http.Handler if otelProvider != nil && otelProvider.Meter != nil { httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) @@ -143,10 +140,6 @@ func run() error { httpMetricsMw = httpMetrics.Middleware } } - if httpMetricsMw == nil { - httpMetricsMw = mw.NoopHTTPMetrics() - } - r.Use(httpMetricsMw) // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() @@ -174,6 +167,24 @@ func run() error { r.Use(middleware.RealIP) r.Use(middleware.Logger) r.Use(middleware.Recoverer) + + // OpenTelemetry middleware (inside group to avoid breaking WebSocket/SSE on other routes) + if cfg.OtelEnabled { + r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) + } + if httpMetricsMw != nil { + // Skip HTTP metrics for SSE streaming endpoints (logs) + r.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/logs") { + next.ServeHTTP(w, r) + return + } + httpMetricsMw(next).ServeHTTP(w, r) + }) + }) + } + r.Use(middleware.Timeout(60 * time.Second)) // OpenAPI request validation with authentication From c287f6d6a3739e5409591314d04e547dd15fa598 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:33:03 -0500 Subject: [PATCH 08/14] Fix all metrics --- cmd/api/main.go | 7 +- 
dashboards/hypeman.json | 130 ++++++++++++++++++++++++++++--------- lib/providers/providers.go | 14 ++-- lib/vmm/client.go | 60 +++++++++++++---- lib/vmm/metrics.go | 9 +++ 5 files changed, 171 insertions(+), 49 deletions(-) diff --git a/cmd/api/main.go b/cmd/api/main.go index f8e91a37..a50d4405 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -27,6 +27,7 @@ import ( mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" "github.com/onkernel/hypeman/lib/otel" + "github.com/onkernel/hypeman/lib/vmm" "github.com/riandyrn/otelchi" "golang.org/x/sync/errgroup" ) @@ -68,12 +69,16 @@ func run() error { }() } - // Initialize exec metrics if OTel is enabled + // Initialize exec and vmm metrics if OTel is enabled if otelProvider != nil && otelProvider.Meter != nil { execMetrics, err := exec.NewMetrics(otelProvider.Meter) if err == nil { exec.SetMetrics(execMetrics) } + vmmMetrics, err := vmm.NewMetrics(otelProvider.Meter) + if err == nil { + vmm.SetMetrics(vmmMetrics) + } } // Initialize app with wire diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index 31c2d3f0..d3b778e6 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -22,7 +22,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_uptime_seconds", + "expr": "hypeman_uptime_seconds{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "refId": "A" } ], @@ -47,7 +47,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_instances_total", + "expr": "sum(hypeman_instances_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (state)", "legendFormat": "{{state}}", "refId": "A" } @@ -69,7 +69,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_total", + "expr": "hypeman_volumes_total{deployment_environment_name=~\"$env\", 
service_instance_id=~\"$instance\"}", "refId": "A" } ], @@ -90,7 +90,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_images_total", + "expr": "sum(hypeman_images_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}) by (status)", "legendFormat": "{{status}}", "refId": "A" } @@ -108,7 +108,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_http_request_duration_seconds_sum[5m]) / rate(hypeman_http_request_duration_seconds_count[5m])", + "expr": "rate(hypeman_http_request_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m]) / rate(hypeman_http_request_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "{{http_request_method}} {{http_route}}", "refId": "A" } @@ -130,7 +130,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_http_requests_total[5m])", + "expr": "rate(hypeman_http_requests_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}", "refId": "A" } @@ -152,19 +152,19 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_create_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 create", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_restore_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, 
sum(rate(hypeman_instances_restore_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 restore", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_instances_standby_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_standby_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", "legendFormat": "p95 standby", "refId": "C" } @@ -186,13 +186,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_allocated_bytes", + "expr": "hypeman_volumes_allocated_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "allocated", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_volumes_used_bytes", + "expr": "hypeman_volumes_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "used", "refId": "B" } @@ -214,13 +214,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_sent_total[5m])", + "expr": "rate(hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "sent (stdin)", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_received_total[5m])", + "expr": "rate(hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "received (stdout+stderr)", "refId": "B" } @@ -242,12 +242,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_images_build_queue_length", + "expr": 
"hypeman_images_build_queue_length{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "queue length", "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "pulls/sec ({{status}})", + "refId": "B" } ], - "title": "Image Build Queue", + "title": "Image Build Queue & Pulls", "type": "timeseries" }, { @@ -260,7 +266,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(hypeman_vmm_api_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(hypeman_vmm_api_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le, operation))", "legendFormat": "p95 {{operation}}", "refId": "A" } @@ -282,13 +288,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_network_allocations_total", + "expr": "hypeman_network_allocations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "IP allocations", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_network_tap_operations_total[5m])", + "expr": "rate(hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", "legendFormat": "TAP ops/sec ({{operation}})", "refId": "B" } @@ -297,7 +303,53 @@ "type": "timeseries" }, { - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }, + "id": 15, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": 
"histogram_quantile(0.95, sum(rate(hypeman_images_build_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", + "legendFormat": "p95 build", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(hypeman_volumes_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", + "legendFormat": "p95 volume create", + "refId": "B" + } + ], + "title": "Image Build & Volume Create (p95)", + "type": "timeseries", + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + } + }, + { + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }, + "id": 16, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "{{from}} → {{to}}", + "refId": "A" + } + ], + "title": "Instance State Transitions/sec", + "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }, "id": 13, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, @@ -306,13 +358,13 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memory_used_bytes", + "expr": "go_memory_used_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "memory used", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_memory_gc_goal_bytes", + "expr": "go_memory_gc_goal_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "GC goal", "refId": "B" } @@ -325,7 +377,7 @@ } }, { - "gridPos": { "h": 8, 
"w": 12, "x": 12, "y": 27 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 }, "id": 14, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, @@ -334,13 +386,31 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "go_goroutine_count", + "expr": "go_goroutine_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "goroutines", "refId": "A" } ], "title": "Goroutines", "type": "timeseries" + }, + { + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, + "id": 17, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "legendFormat": "sessions/sec ({{status}})", + "refId": "A" + } + ], + "title": "Exec Sessions/sec", + "type": "timeseries" } ], "refresh": "10s", @@ -355,14 +425,14 @@ "hide": 0, "includeAll": true, "label": "Environment", - "multi": false, + "multi": true, "name": "env", "options": [], "query": { "query": "label_values(hypeman_uptime_seconds, deployment_environment_name)", "refId": "A" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" }, { @@ -372,14 +442,14 @@ "hide": 0, "includeAll": true, "label": "Instance", - "multi": false, + "multi": true, "name": "instance", "options": [], "query": { "query": "label_values(hypeman_uptime_seconds, service_instance_id)", "refId": "A" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" } ] @@ -389,6 +459,6 @@ "timezone": "", "title": "Hypeman", "uid": "hypeman-overview", - "version": 1, + "version": 2, "weekStart": "" } diff --git a/lib/providers/providers.go b/lib/providers/providers.go 
index fa9d0b7c..93713d13 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -14,6 +14,7 @@ import ( "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel" ) // ProvideLogger provides a structured logger with subsystem-specific levels @@ -39,7 +40,8 @@ func ProvidePaths(cfg *config.Config) *paths.Paths { // ProvideImageManager provides the image manager func ProvideImageManager(p *paths.Paths, cfg *config.Config) (images.Manager, error) { - return images.NewManager(p, cfg.MaxConcurrentBuilds, nil) + meter := otel.GetMeterProvider().Meter("hypeman") + return images.NewManager(p, cfg.MaxConcurrentBuilds, meter) } // ProvideSystemManager provides the system manager @@ -49,7 +51,8 @@ func ProvideSystemManager(p *paths.Paths) system.Manager { // ProvideNetworkManager provides the network manager func ProvideNetworkManager(p *paths.Paths, cfg *config.Config) network.Manager { - return network.NewManager(p, cfg, nil) + meter := otel.GetMeterProvider().Meter("hypeman") + return network.NewManager(p, cfg, meter) } // ProvideInstanceManager provides the instance manager @@ -88,7 +91,9 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima MaxTotalMemory: maxTotalMemory, } - return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, nil, nil), nil + meter := otel.GetMeterProvider().Meter("hypeman") + tracer := otel.GetTracerProvider().Tracer("hypeman") + return instances.NewManager(p, imageManager, systemManager, networkManager, volumeManager, limits, meter, tracer), nil } // ProvideVolumeManager provides the volume manager @@ -103,5 +108,6 @@ func ProvideVolumeManager(p *paths.Paths, cfg *config.Config) (volumes.Manager, maxTotalVolumeStorage = int64(storageSize) } - return volumes.NewManager(p, maxTotalVolumeStorage, nil), nil + meter := otel.GetMeterProvider().Meter("hypeman") + return 
volumes.NewManager(p, maxTotalVolumeStorage, meter), nil } diff --git a/lib/vmm/client.go b/lib/vmm/client.go index c70a6572..7fc7a906 100644 --- a/lib/vmm/client.go +++ b/lib/vmm/client.go @@ -12,6 +12,8 @@ import ( "time" "github.com/onkernel/hypeman/lib/paths" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" ) // VMM wraps the generated Cloud Hypervisor client (API v0.3.0) @@ -20,15 +22,45 @@ type VMM struct { socketPath string } +// metricsRoundTripper wraps an http.RoundTripper to record metrics +type metricsRoundTripper struct { + base http.RoundTripper +} + +func (m *metricsRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + start := time.Now() + resp, err := m.base.RoundTrip(req) + + // Record metrics using global VMMMetrics + if VMMMetrics != nil { + operation := req.Method + " " + req.URL.Path + status := "success" + if err != nil || (resp != nil && resp.StatusCode >= 400) { + status = "error" + VMMMetrics.APIErrorsTotal.Add(req.Context(), 1, + metric.WithAttributes(attribute.String("operation", operation))) + } + VMMMetrics.APIDuration.Record(req.Context(), time.Since(start).Seconds(), + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", status), + )) + } + + return resp, err +} + // NewVMM creates a Cloud Hypervisor client for an existing VMM socket func NewVMM(socketPath string) (*VMM, error) { - httpClient := &http.Client{ - Transport: &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) - }, + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) }, - Timeout: 30 * time.Second, + } + + httpClient := &http.Client{ + Transport: &metricsRoundTripper{base: transport}, + Timeout: 30 * time.Second, } client, err := NewClientWithResponses("http://localhost/api/v1", @@ -71,15 +103,15 @@ func 
StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // Build command arguments args := []string{"--api-socket", socketPath} args = append(args, extraArgs...) - + // Use Command (not CommandContext) so process survives parent context cancellation cmd := exec.Command(binaryPath, args...) - + // Daemonize: detach from parent process group cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, // Create new process group } - + // Redirect stdout/stderr to log files (process won't block on I/O) instanceDir := filepath.Dir(socketPath) stdoutFile, err := os.OpenFile( @@ -94,7 +126,7 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion // The child process receives duplicated file descriptors during fork/exec, // so it can continue writing to the log files even after we close them here. defer stdoutFile.Close() - + stderrFile, err := os.OpenFile( filepath.Join(instanceDir, "ch-stderr.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, @@ -104,24 +136,24 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion return 0, fmt.Errorf("create stderr log: %w", err) } defer stderrFile.Close() - + cmd.Stdout = stdoutFile cmd.Stderr = stderrFile if err := cmd.Start(); err != nil { return 0, fmt.Errorf("start cloud-hypervisor: %w", err) } - + pid := cmd.Process.Pid // Wait for socket to be ready (use fresh context with timeout, not parent context) waitCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() - + if err := waitForSocket(waitCtx, socketPath, 5*time.Second); err != nil { return 0, err } - + return pid, nil } diff --git a/lib/vmm/metrics.go b/lib/vmm/metrics.go index 39ef4dd1..19b514cf 100644 --- a/lib/vmm/metrics.go +++ b/lib/vmm/metrics.go @@ -14,6 +14,15 @@ type Metrics struct { APIErrorsTotal metric.Int64Counter } +// VMMMetrics is the global metrics instance for the vmm package. +// Set this via SetMetrics() during application initialization. 
+var VMMMetrics *Metrics + +// SetMetrics sets the global metrics instance. +func SetMetrics(m *Metrics) { + VMMMetrics = m +} + // NewMetrics creates VMM metrics instruments. // If meter is nil, returns nil (metrics disabled). func NewMetrics(meter metric.Meter) (*Metrics, error) { From 774c9055a2aa4b54313e3d08a2612da9f8c11767 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:37:48 -0500 Subject: [PATCH 09/14] Fix queries --- dashboards/hypeman.json | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index d3b778e6..652742fd 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -152,24 +152,24 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 create", + "expr": "hypeman_instances_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg create", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_restore_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 restore", + "expr": "hypeman_instances_restore_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_restore_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg restore", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - 
"expr": "histogram_quantile(0.95, sum(rate(hypeman_instances_standby_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 standby", + "expr": "hypeman_instances_standby_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_instances_standby_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg standby", "refId": "C" } ], - "title": "Instance Operation Latency (p95)", + "title": "Instance Operation Latency (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, @@ -266,12 +266,12 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_vmm_api_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le, operation))", - "legendFormat": "p95 {{operation}}", + "expr": "hypeman_vmm_api_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_vmm_api_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg {{operation}}", "refId": "A" } ], - "title": "VMM API Latency (p95)", + "title": "VMM API Latency (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, @@ -312,18 +312,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_images_build_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 build", + "expr": "hypeman_images_build_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_images_build_duration_seconds_count{deployment_environment_name=~\"$env\", 
service_instance_id=~\"$instance\"}", + "legendFormat": "avg build", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(hypeman_volumes_create_duration_seconds_bucket{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])) by (le))", - "legendFormat": "p95 volume create", + "expr": "hypeman_volumes_create_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_volumes_create_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg volume create", "refId": "B" } ], - "title": "Image Build & Volume Create (p95)", + "title": "Image Build & Volume Create (avg)", "type": "timeseries", "fieldConfig": { "defaults": { "unit": "s" }, From b902b731feb39627d35fa554b836729754ce333c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:40:39 -0500 Subject: [PATCH 10/14] Fix exec queries --- dashboards/hypeman.json | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index 652742fd..c2c46c55 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -214,21 +214,21 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_exec_bytes_sent_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "sent (stdin)", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_exec_bytes_received_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", 
"legendFormat": "received (stdout+stderr)", "refId": "B" } ], - "title": "Exec Throughput", + "title": "Exec Bytes (cumulative)", "type": "timeseries", "fieldConfig": { - "defaults": { "unit": "Bps" }, + "defaults": { "unit": "bytes" }, "overrides": [] } }, @@ -248,8 +248,8 @@ }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "pulls/sec ({{status}})", + "expr": "hypeman_images_pulls_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "pulls ({{status}})", "refId": "B" } ], @@ -294,8 +294,8 @@ }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "TAP ops/sec ({{operation}})", + "expr": "hypeman_network_tap_operations_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "TAP ops ({{operation}})", "refId": "B" } ], @@ -340,12 +340,12 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", + "expr": "hypeman_instances_state_transitions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", "legendFormat": "{{from}} → {{to}}", "refId": "A" } ], - "title": "Instance State Transitions/sec", + "title": "Instance State Transitions", "type": "timeseries" }, { @@ -404,12 +404,18 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}[5m])", - "legendFormat": "sessions/sec ({{status}})", + "expr": 
"hypeman_exec_sessions_total{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "sessions ({{status}}, exit={{exit_code}})", "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "hypeman_exec_duration_seconds_sum{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / hypeman_exec_duration_seconds_count{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "legendFormat": "avg duration (s)", + "refId": "B" } ], - "title": "Exec Sessions/sec", + "title": "Exec Sessions & Duration", "type": "timeseries" } ], From c9613d11d438e64d84c086b00436e9f178bc1ef0 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 15:47:40 -0500 Subject: [PATCH 11/14] Add otel export of logs --- README.md | 2 +- cmd/api/main.go | 5 +++ go.mod | 4 +++ go.sum | 8 +++++ lib/logger/logger.go | 74 +++++++++++++++++++++++++++++++++++--- lib/otel/README.md | 14 ++++++-- lib/otel/otel.go | 49 +++++++++++++++++++++++++ lib/providers/providers.go | 4 ++- 8 files changed, 152 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ac2a048f..5624ea6c 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ echo "ENV=yourname" >> .env make dev ``` -Open http://localhost:3000 to view traces (Tempo) and metrics (Mimir) in Grafana. +Open http://localhost:3000 to view traces (Tempo), metrics (Mimir), and logs (Loki) in Grafana. **Import the Hypeman dashboard:** 1. 
Go to Dashboards → New → Import diff --git a/cmd/api/main.go b/cmd/api/main.go index a50d4405..59413ddf 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -81,6 +81,11 @@ func run() error { } } + // Set global OTel log handler for logger package + if otelProvider != nil && otelProvider.LogHandler != nil { + otel.SetGlobalLogHandler(otelProvider.LogHandler) + } + // Initialize app with wire app, cleanup, err := initializeApp() if err != nil { diff --git a/go.mod b/go.mod index 7c894df1..210a679a 100644 --- a/go.mod +++ b/go.mod @@ -88,7 +88,11 @@ require ( github.com/vishvananda/netns v0.0.5 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/log v0.14.0 // indirect + go.opentelemetry.io/otel/sdk/log v0.14.0 // indirect go.opentelemetry.io/proto/otlp v1.7.1 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect diff --git a/go.sum b/go.sum index 91d5185a..e1226b84 100644 --- a/go.sum +++ b/go.sum @@ -198,20 +198,28 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 h1:bwnLpizECbPr1RrQ27waeY2SPIPeccCx/xLuoYADZ9s= +go.opentelemetry.io/contrib/bridges/otelslog v0.13.0/go.mod h1:3nWlOiiqA9UtUnrcNk82mYasNxD8ehOspL0gOfEo6Y4= go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= 
go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 h1:OMqPldHt79PqWKOMYIAQs3CxAi7RLgPxwfFSwr4ZxtM= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0/go.mod h1:1biG4qiqTxKiUCtoWDPpL3fB3KxVwCiGw81j3nKMuHE= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/log v0.14.0 h1:2rzJ+pOAZ8qmZ3DDHg73NEKzSZkhkGIua9gXtxNGgrM= +go.opentelemetry.io/otel/log v0.14.0/go.mod h1:5jRG92fEAgx0SU/vFPxmJvhIuDU9E1SUnEQrMlJpOno= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/log v0.14.0 h1:JU/U3O7N6fsAXj0+CXz21Czg532dW2V4gG1HE/e8Zrg= +go.opentelemetry.io/otel/sdk/log v0.14.0/go.mod h1:imQvII+0ZylXfKU7/wtOND8Hn4OpT3YUoIgqJVksUkM= 
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= diff --git a/lib/logger/logger.go b/lib/logger/logger.go index 23a69ef3..6d60fcde 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -101,16 +101,29 @@ func NewLogger(cfg Config) *slog.Logger { } // NewSubsystemLogger creates a logger for a specific subsystem with its configured level. -func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { +// If otelHandler is provided, logs will be sent both to stdout and to OTel. +func NewSubsystemLogger(subsystem string, cfg Config, otelHandler slog.Handler) *slog.Logger { level := cfg.LevelFor(subsystem) - handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + jsonHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ Level: level, AddSource: cfg.AddSource, }) - // Wrap with trace context handler if we want trace IDs in logs + + var baseHandler slog.Handler + if otelHandler != nil { + // Use multi-handler to write to both stdout and OTel + baseHandler = &multiHandler{ + handlers: []slog.Handler{jsonHandler, otelHandler}, + } + } else { + baseHandler = jsonHandler + } + + // Wrap with trace context handler for trace IDs in logs wrappedHandler := &traceContextHandler{ - Handler: handler, + Handler: baseHandler, subsystem: subsystem, + level: level, } return slog.New(wrappedHandler) } @@ -119,6 +132,12 @@ func NewSubsystemLogger(subsystem string, cfg Config) *slog.Logger { type traceContextHandler struct { slog.Handler subsystem string + level slog.Level +} + +// Enabled reports whether the handler handles records at the given level. +func (h *traceContextHandler) Enabled(ctx context.Context, level slog.Level) bool { + return level >= h.level } // Handle adds trace_id and span_id from the context if available. 
@@ -143,6 +162,7 @@ func (h *traceContextHandler) WithAttrs(attrs []slog.Attr) slog.Handler { return &traceContextHandler{ Handler: h.Handler.WithAttrs(attrs), subsystem: h.subsystem, + level: h.level, } } @@ -151,7 +171,53 @@ func (h *traceContextHandler) WithGroup(name string) slog.Handler { return &traceContextHandler{ Handler: h.Handler.WithGroup(name), subsystem: h.subsystem, + level: h.level, + } +} + +// multiHandler fans out log records to multiple handlers. +type multiHandler struct { + handlers []slog.Handler +} + +// Enabled returns true if any handler is enabled. +func (m *multiHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, h := range m.handlers { + if h.Enabled(ctx, level) { + return true + } + } + return false +} + +// Handle writes the record to all handlers. +func (m *multiHandler) Handle(ctx context.Context, r slog.Record) error { + for _, h := range m.handlers { + if h.Enabled(ctx, r.Level) { + if err := h.Handle(ctx, r); err != nil { + return err + } + } + } + return nil +} + +// WithAttrs returns a new multiHandler with the given attributes. +func (m *multiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithAttrs(attrs) + } + return &multiHandler{handlers: handlers} +} + +// WithGroup returns a new multiHandler with the given group name. +func (m *multiHandler) WithGroup(name string) slog.Handler { + handlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + handlers[i] = h.WithGroup(name) } + return &multiHandler{handlers: handlers} } // AddToContext adds a logger to the context. diff --git a/lib/otel/README.md b/lib/otel/README.md index ff181ef3..6db7a686 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -4,9 +4,10 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. 
## Features -- OTLP export for traces and metrics (gRPC) +- OTLP export for traces, metrics, and logs (gRPC) - Runtime metrics (Go GC, goroutines, memory) - Application-specific metrics per subsystem +- Log bridging from slog to OTel (viewable in Grafana/Loki) - Graceful degradation (failures don't crash the app) ## Configuration @@ -89,6 +90,15 @@ provider, shutdown, err := otel.Init(ctx, otel.Config{ }) defer shutdown(ctx) -meter := provider.Meter // Use for creating metrics +meter := provider.Meter // Use for creating metrics +tracer := provider.Tracer // Use for creating traces +logHandler := provider.LogHandler // Use with slog for logs to OTel ``` +## Logs + +Logs are exported via the OTel log bridge (`otelslog`). When OTel is enabled, all slog logs are sent to Loki (via OTLP) and include: +- `subsystem` attribute (API, IMAGES, INSTANCES, etc.) +- `trace_id` and `span_id` when available +- Service attributes (name, instance, environment) + diff --git a/lib/otel/otel.go b/lib/otel/otel.go index acac499a..2b1c2e41 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -4,15 +4,19 @@ package otel import ( "context" "fmt" + "log/slog" goruntime "runtime" "time" + "go.opentelemetry.io/contrib/bridges/otelslog" otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" + sdklog "go.opentelemetry.io/otel/sdk/log" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" @@ -35,8 +39,10 @@ type Config struct { type Provider struct { TracerProvider *sdktrace.TracerProvider MeterProvider *sdkmetric.MeterProvider + LoggerProvider *sdklog.LoggerProvider Tracer trace.Tracer Meter metric.Meter + 
LogHandler slog.Handler startTime time.Time } @@ -105,6 +111,26 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err sdkmetric.WithResource(res), ) + // Create log exporter + logOpts := []otlploggrpc.Option{ + otlploggrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + logOpts = append(logOpts, otlploggrpc.WithInsecure()) + } + logExporter, err := otlploggrpc.New(ctx, logOpts...) + if err != nil { + tracerProvider.Shutdown(ctx) + meterProvider.Shutdown(ctx) + return nil, nil, fmt.Errorf("create log exporter: %w", err) + } + + // Create logger provider + loggerProvider := sdklog.NewLoggerProvider( + sdklog.WithProcessor(sdklog.NewBatchProcessor(logExporter)), + sdklog.WithResource(res), + ) + // Set global providers otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) @@ -117,14 +143,20 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err if err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { tracerProvider.Shutdown(ctx) meterProvider.Shutdown(ctx) + loggerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("start runtime metrics: %w", err) } + // Create slog handler that bridges to OTel + logHandler := otelslog.NewHandler(cfg.ServiceName, otelslog.WithLoggerProvider(loggerProvider)) + provider := &Provider{ TracerProvider: tracerProvider, MeterProvider: meterProvider, + LoggerProvider: loggerProvider, Tracer: tracerProvider.Tracer(cfg.ServiceName), Meter: meterProvider.Meter(cfg.ServiceName), + LogHandler: logHandler, startTime: time.Now(), } @@ -132,6 +164,7 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err if err := provider.registerSystemMetrics(cfg); err != nil { tracerProvider.Shutdown(ctx) meterProvider.Shutdown(ctx) + loggerProvider.Shutdown(ctx) return nil, nil, fmt.Errorf("register system metrics: %w", err) } @@ -143,6 +176,9 @@ func Init(ctx context.Context, cfg Config) (*Provider, 
func(context.Context) err if err := meterProvider.Shutdown(ctx); err != nil { errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) } + if err := loggerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown logger: %w", err)) + } if len(errs) > 0 { return fmt.Errorf("shutdown errors: %v", errs) } @@ -214,3 +250,16 @@ func (p *Provider) MeterFor(subsystem string) metric.Meter { func GoVersion() string { return goruntime.Version() } + +// globalLogHandler holds the OTel log handler for use by the logger package. +var globalLogHandler slog.Handler + +// SetGlobalLogHandler sets the global OTel log handler. +func SetGlobalLogHandler(h slog.Handler) { + globalLogHandler = h +} + +// GetGlobalLogHandler returns the global OTel log handler. +func GetGlobalLogHandler() slog.Handler { + return globalLogHandler +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 93713d13..010ee3e6 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -11,6 +11,7 @@ import ( "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" + hypemanotel "github.com/onkernel/hypeman/lib/otel" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" @@ -20,7 +21,8 @@ import ( // ProvideLogger provides a structured logger with subsystem-specific levels func ProvideLogger() *slog.Logger { cfg := logger.NewConfig() - return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg) + otelHandler := hypemanotel.GetGlobalLogHandler() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) } // ProvideContext provides a context with logger attached From 0307c56281f52451d43aafee4f4ce1ea62691bf1 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:06:55 -0500 Subject: [PATCH 12/14] associate logs and traces --- cmd/api/api/images.go | 8 ++-- cmd/api/api/instances.go | 24 
++++++------ cmd/api/api/volumes.go | 16 ++++---- cmd/api/main.go | 19 ++++++++-- lib/middleware/otel.go | 79 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 28 deletions(-) diff --git a/cmd/api/api/images.go b/cmd/api/api/images.go index 913b9918..68f4cc96 100644 --- a/cmd/api/api/images.go +++ b/cmd/api/api/images.go @@ -14,7 +14,7 @@ func (s *ApiService) ListImages(ctx context.Context, request oapi.ListImagesRequ domainImages, err := s.ImageManager.ListImages(ctx) if err != nil { - log.Error("failed to list images", "error", err) + log.ErrorContext(ctx, "failed to list images", "error", err) return oapi.ListImages500JSONResponse{ Code: "internal_error", Message: "failed to list images", @@ -50,7 +50,7 @@ func (s *ApiService) CreateImage(ctx context.Context, request oapi.CreateImageRe Message: "image not found", }, nil default: - log.Error("failed to create image", "error", err, "name", request.Body.Name) + log.ErrorContext(ctx, "failed to create image", "error", err, "name", request.Body.Name) return oapi.CreateImage500JSONResponse{ Code: "internal_error", Message: "failed to create image", @@ -72,7 +72,7 @@ func (s *ApiService) GetImage(ctx context.Context, request oapi.GetImageRequestO Message: "image not found", }, nil default: - log.Error("failed to get image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to get image", "error", err, "name", request.Name) return oapi.GetImage500JSONResponse{ Code: "internal_error", Message: "failed to get image", @@ -94,7 +94,7 @@ func (s *ApiService) DeleteImage(ctx context.Context, request oapi.DeleteImageRe Message: "image not found", }, nil default: - log.Error("failed to delete image", "error", err, "name", request.Name) + log.ErrorContext(ctx, "failed to delete image", "error", err, "name", request.Name) return oapi.DeleteImage500JSONResponse{ Code: "internal_error", Message: "failed to delete image", diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go 
index 2d84ac08..12f5bd7b 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -21,18 +21,18 @@ func (s *ApiService) ListInstances(ctx context.Context, request oapi.ListInstanc domainInsts, err := s.InstanceManager.ListInstances(ctx) if err != nil { - log.Error("failed to list instances", "error", err) + log.ErrorContext(ctx, "failed to list instances", "error", err) return oapi.ListInstances500JSONResponse{ Code: "internal_error", Message: "failed to list instances", }, nil } - + oapiInsts := make([]oapi.Instance, len(domainInsts)) for i, inst := range domainInsts { oapiInsts[i] = instanceToOAPI(inst) } - + return oapi.ListInstances200JSONResponse(oapiInsts), nil } @@ -160,7 +160,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Message: err.Error(), }, nil default: - log.Error("failed to create instance", "error", err, "image", request.Body.Image) + log.ErrorContext(ctx, "failed to create instance", "error", err, "image", request.Body.Image) return oapi.CreateInstance500JSONResponse{ Code: "internal_error", Message: "failed to create instance", @@ -195,7 +195,7 @@ func (s *ApiService) GetInstance(ctx context.Context, request oapi.GetInstanceRe Message: "multiple instances have this name, use instance ID instead", }, nil default: - log.Error("failed to get instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get instance", "error", err, "id", request.Id) return oapi.GetInstance500JSONResponse{ Code: "internal_error", Message: "failed to get instance", @@ -235,7 +235,7 @@ func (s *ApiService) DeleteInstance(ctx context.Context, request oapi.DeleteInst Message: "instance not found", }, nil default: - log.Error("failed to delete instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to delete instance", "error", err, "id", request.Id) return oapi.DeleteInstance500JSONResponse{ Code: "internal_error", Message: "failed to delete instance", @@ -280,7 +280,7 @@ func (s 
*ApiService) StandbyInstance(ctx context.Context, request oapi.StandbyIn Message: err.Error(), }, nil default: - log.Error("failed to standby instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to standby instance", "error", err, "id", request.Id) return oapi.StandbyInstance500JSONResponse{ Code: "internal_error", Message: "failed to standby instance", @@ -325,7 +325,7 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn Message: err.Error(), }, nil default: - log.Error("failed to restore instance", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to restore instance", "error", err, "id", request.Id) return oapi.RestoreInstance500JSONResponse{ Code: "internal_error", Message: "failed to restore instance", @@ -417,18 +417,18 @@ func (s *ApiService) GetInstanceLogs(ctx context.Context, request oapi.GetInstan // AttachVolume attaches a volume to an instance (not yet implemented) func (s *ApiService) AttachVolume(ctx context.Context, request oapi.AttachVolumeRequestObject) (oapi.AttachVolumeResponseObject, error) { - return oapi.AttachVolume500JSONResponse{ + return oapi.AttachVolume500JSONResponse{ Code: "not_implemented", Message: "volume attachment not yet implemented", - }, nil + }, nil } // DetachVolume detaches a volume from an instance (not yet implemented) func (s *ApiService) DetachVolume(ctx context.Context, request oapi.DetachVolumeRequestObject) (oapi.DetachVolumeResponseObject, error) { - return oapi.DetachVolume500JSONResponse{ + return oapi.DetachVolume500JSONResponse{ Code: "not_implemented", Message: "volume detachment not yet implemented", - }, nil + }, nil } // instanceToOAPI converts domain Instance to OAPI Instance diff --git a/cmd/api/api/volumes.go b/cmd/api/api/volumes.go index 8b4afd88..6e903801 100644 --- a/cmd/api/api/volumes.go +++ b/cmd/api/api/volumes.go @@ -18,18 +18,18 @@ func (s *ApiService) ListVolumes(ctx context.Context, request oapi.ListVolumesRe domainVols, err := 
s.VolumeManager.ListVolumes(ctx) if err != nil { - log.Error("failed to list volumes", "error", err) + log.ErrorContext(ctx, "failed to list volumes", "error", err) return oapi.ListVolumes500JSONResponse{ Code: "internal_error", Message: "failed to list volumes", }, nil } - + oapiVols := make([]oapi.Volume, len(domainVols)) for i, vol := range domainVols { oapiVols[i] = volumeToOAPI(vol) } - + return oapi.ListVolumes200JSONResponse(oapiVols), nil } @@ -56,7 +56,7 @@ func (s *ApiService) CreateVolume(ctx context.Context, request oapi.CreateVolume Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume", "error", err, "name", request.JSONBody.Name) + log.ErrorContext(ctx, "failed to create volume", "error", err, "name", request.JSONBody.Name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -171,7 +171,7 @@ func (s *ApiService) createVolumeFromMultipart(ctx context.Context, multipartRea Message: "volume with this ID already exists", }, nil } - log.Error("failed to create volume from archive", "error", err, "name", name) + log.ErrorContext(ctx, "failed to create volume from archive", "error", err, "name", name) return oapi.CreateVolume500JSONResponse{ Code: "internal_error", Message: "failed to create volume", @@ -222,7 +222,7 @@ func (s *ApiService) GetVolume(ctx context.Context, request oapi.GetVolumeReques Message: "multiple volumes have this name, use volume ID instead", }, nil default: - log.Error("failed to get volume", "error", err, "id", request.Id) + log.ErrorContext(ctx, "failed to get volume", "error", err, "id", request.Id) return oapi.GetVolume500JSONResponse{ Code: "internal_error", Message: "failed to get volume", @@ -269,7 +269,7 @@ func (s *ApiService) DeleteVolume(ctx context.Context, request oapi.DeleteVolume Message: "volume is in use by an instance", }, nil default: - log.Error("failed to delete volume", "error", err, "id", request.Id) + 
log.ErrorContext(ctx, "failed to delete volume", "error", err, "id", request.Id) return oapi.DeleteVolume500JSONResponse{ Code: "internal_error", Message: "failed to delete volume", @@ -302,5 +302,3 @@ func volumeToOAPI(vol volumes.Volume) oapi.Volume { return oapiVol } - - diff --git a/cmd/api/main.go b/cmd/api/main.go index 59413ddf..3951d7b7 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -151,6 +151,13 @@ func run() error { } } + // Create access logger with OTel handler for HTTP request logging with trace correlation + var accessLogHandler slog.Handler + if otelProvider != nil { + accessLogHandler = otelProvider.LogHandler + } + accessLogger := mw.NewAccessLogger(accessLogHandler) + // Load OpenAPI spec for request validation spec, err := oapi.GetSwagger() if err != nil { @@ -162,11 +169,12 @@ func run() error { spec.Servers = nil // Custom exec endpoint (outside OpenAPI spec, uses WebSocket) + // Note: No otelchi here as WebSocket doesn't work well with tracing middleware r.With( middleware.RequestID, middleware.RealIP, - middleware.Logger, middleware.Recoverer, + mw.AccessLogger(accessLogger), mw.JwtAuth(app.Config.JwtSecret), ).Get("/instances/{id}/exec", app.ApiService.ExecHandler) @@ -175,13 +183,18 @@ func run() error { // Common middleware r.Use(middleware.RequestID) r.Use(middleware.RealIP) - r.Use(middleware.Logger) r.Use(middleware.Recoverer) - // OpenTelemetry middleware (inside group to avoid breaking WebSocket/SSE on other routes) + // OpenTelemetry tracing middleware FIRST (creates span context) if cfg.OtelEnabled { r.Use(otelchi.Middleware(cfg.OtelServiceName, otelchi.WithChiRoutes(r))) } + + // Inject logger into request context for handlers to use + r.Use(mw.InjectLogger(accessLogger)) + + // Access logger AFTER otelchi so trace context is available + r.Use(mw.AccessLogger(accessLogger)) if httpMetricsMw != nil { // Skip HTTP metrics for SSE streaming endpoints (logs) r.Use(func(next http.Handler) http.Handler { diff --git 
a/lib/middleware/otel.go b/lib/middleware/otel.go index 823aef08..c77be233 100644 --- a/lib/middleware/otel.go +++ b/lib/middleware/otel.go @@ -1,10 +1,13 @@ package middleware import ( + "fmt" + "log/slog" "net/http" "time" "github.com/go-chi/chi/v5" + "github.com/onkernel/hypeman/lib/logger" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -97,3 +100,79 @@ func NoopHTTPMetrics() func(http.Handler) http.Handler { return next } } + +// AccessLogger returns a middleware that logs HTTP requests using slog with trace context. +// This replaces chi's middleware.Logger to get logs into OTel/Loki with trace correlation. +func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // Wrap response writer to capture status code and bytes + wrapped := &accessLogWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } + + // Process request + next.ServeHTTP(wrapped, r) + + // Get route pattern + routePattern := chi.RouteContext(r.Context()).RoutePattern() + if routePattern == "" { + routePattern = r.URL.Path + } + + // Log with trace context from request context + duration := time.Since(start) + log.InfoContext(r.Context(), + fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.statusCode, wrapped.bytesWritten, duration.Milliseconds()), + "method", r.Method, + "path", routePattern, + "status", wrapped.statusCode, + "bytes", wrapped.bytesWritten, + "duration_ms", duration.Milliseconds(), + "remote_addr", r.RemoteAddr, + ) + }) + } +} + +// accessLogWriter wraps http.ResponseWriter to capture status and bytes. 
+type accessLogWriter struct { + http.ResponseWriter + statusCode int + bytesWritten int +} + +func (w *accessLogWriter) WriteHeader(code int) { + w.statusCode = code + w.ResponseWriter.WriteHeader(code) +} + +func (w *accessLogWriter) Write(b []byte) (int, error) { + n, err := w.ResponseWriter.Write(b) + w.bytesWritten += n + return n, err +} + +func (w *accessLogWriter) Unwrap() http.ResponseWriter { + return w.ResponseWriter +} + +// NewAccessLogger creates an access logger with OTel handler if available. +func NewAccessLogger(otelHandler slog.Handler) *slog.Logger { + cfg := logger.NewConfig() + return logger.NewSubsystemLogger(logger.SubsystemAPI, cfg, otelHandler) +} + +// InjectLogger returns middleware that adds the logger to the request context. +// This enables handlers to use logger.FromContext(ctx) with trace correlation. +func InjectLogger(log *slog.Logger) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := logger.AddToContext(r.Context(), log) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } +} From d8d45e0296103a950292ed1ca61915a7272a134f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:12:56 -0500 Subject: [PATCH 13/14] Use logging context where we can --- lib/exec/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/exec/client.go b/lib/exec/client.go index 4ca9fd40..a32e6338 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -203,7 +203,7 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti // dialVsock connects to Cloud Hypervisor's vsock Unix socket and performs the handshake func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { - slog.Debug("connecting to vsock", "socket", vsockSocketPath) + slog.DebugContext(ctx, "connecting to vsock", "socket", vsockSocketPath) // Use dial timeout, respecting context deadline if 
shorter dialTimeout := vsockDialTimeout @@ -220,7 +220,7 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("dial vsock socket %s: %w", vsockSocketPath, err) } - slog.Debug("connected to vsock socket, performing handshake", "port", vsockGuestPort) + slog.DebugContext(ctx, "connected to vsock socket, performing handshake", "port", vsockGuestPort) // Set deadline for handshake if err := conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { @@ -255,7 +255,7 @@ func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { return nil, fmt.Errorf("vsock handshake failed: %s", response) } - slog.Debug("vsock handshake successful", "response", response) + slog.DebugContext(ctx, "vsock handshake successful", "response", response) // Return wrapped connection that uses the bufio.Reader // This ensures any bytes buffered during handshake are not lost From a02971115a2c6268784af04605a7aa45b57c5bed Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 3 Dec 2025 16:31:49 -0500 Subject: [PATCH 14/14] Simplify --- lib/images/manager.go | 95 ------------ lib/images/metrics.go | 103 +++++++++++++ lib/instances/manager.go | 110 -------------- lib/instances/metrics.go | 118 +++++++++++++++ lib/middleware/otel.go | 126 +++++++++------- lib/network/manager.go | 54 ------- lib/network/metrics.go | 61 ++++++++ lib/otel/metrics.go | 317 --------------------------------------- lib/volumes/manager.go | 96 ------------ lib/volumes/metrics.go | 105 +++++++++++++ 10 files changed, 462 insertions(+), 723 deletions(-) create mode 100644 lib/images/metrics.go create mode 100644 lib/instances/metrics.go create mode 100644 lib/network/metrics.go delete mode 100644 lib/otel/metrics.go create mode 100644 lib/volumes/metrics.go diff --git a/lib/images/manager.go b/lib/images/manager.go index d167c5bd..b637bce5 100644 --- a/lib/images/manager.go +++ b/lib/images/manager.go @@ -10,7 +10,6 @@ import ( "time" 
"github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -30,12 +29,6 @@ type Manager interface { RecoverInterruptedBuilds() } -// Metrics holds the metrics instruments for image operations. -type Metrics struct { - buildDuration metric.Float64Histogram - pullsTotal metric.Int64Counter -} - type manager struct { paths *paths.Paths ociClient *ociClient @@ -73,75 +66,6 @@ func NewManager(p *paths.Paths, maxConcurrentBuilds int, meter metric.Meter) (Ma return m, nil } -// newMetrics creates and registers all image metrics. -func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - buildDuration, err := meter.Float64Histogram( - "hypeman_images_build_duration_seconds", - metric.WithDescription("Time to build an image"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - pullsTotal, err := meter.Int64Counter( - "hypeman_images_pulls_total", - metric.WithDescription("Total number of image pulls from registries"), - ) - if err != nil { - return nil, err - } - - // Register observable gauges for queue length and total images - buildQueueLength, err := meter.Int64ObservableGauge( - "hypeman_images_build_queue_length", - metric.WithDescription("Current number of images in the build queue"), - ) - if err != nil { - return nil, err - } - - imagesTotal, err := meter.Int64ObservableGauge( - "hypeman_images_total", - metric.WithDescription("Total number of cached images"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - // Report queue length - o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) - - // Count images by status - metas, err := listAllTags(m.paths) - if err != nil { - return nil - } - statusCounts := make(map[string]int64) - for _, meta := range metas { - statusCounts[meta.Status]++ - } - for status, count := range statusCounts { - o.ObserveInt64(imagesTotal, count, - 
metric.WithAttributes(attribute.String("status", status))) - } - return nil - }, - buildQueueLength, - imagesTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - buildDuration: buildDuration, - pullsTotal: pullsTotal, - }, nil -} - func (m *manager) ListImages(ctx context.Context) ([]Image, error) { metas, err := listAllTags(m.paths) if err != nil { @@ -307,25 +231,6 @@ func (m *manager) buildImage(ctx context.Context, ref *ResolvedRef) { m.recordBuildMetrics(ctx, buildStart, "success") } -// recordBuildMetrics records the build duration metric. -func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - m.metrics.buildDuration.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - -// recordPullMetrics records the pull counter metric. -func (m *manager) recordPullMetrics(ctx context.Context, status string) { - if m.metrics == nil { - return - } - m.metrics.pullsTotal.Add(ctx, 1, - metric.WithAttributes(attribute.String("status", status))) -} - func (m *manager) updateStatusByDigest(ref *ResolvedRef, status string, err error) { meta, readErr := readMetadata(m.paths, ref.Repository(), ref.DigestHex()) if readErr != nil { diff --git a/lib/images/metrics.go b/lib/images/metrics.go new file mode 100644 index 00000000..08334ef2 --- /dev/null +++ b/lib/images/metrics.go @@ -0,0 +1,103 @@ +package images + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for image operations. +type Metrics struct { + buildDuration metric.Float64Histogram + pullsTotal metric.Int64Counter +} + +// newMetrics creates and registers all image metrics. 
+func newMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + buildDuration, err := meter.Float64Histogram( + "hypeman_images_build_duration_seconds", + metric.WithDescription("Time to build an image"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + pullsTotal, err := meter.Int64Counter( + "hypeman_images_pulls_total", + metric.WithDescription("Total number of image pulls from registries"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges for queue length and total images + buildQueueLength, err := meter.Int64ObservableGauge( + "hypeman_images_build_queue_length", + metric.WithDescription("Current number of images in the build queue"), + ) + if err != nil { + return nil, err + } + + imagesTotal, err := meter.Int64ObservableGauge( + "hypeman_images_total", + metric.WithDescription("Total number of cached images"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + // Report queue length + o.ObserveInt64(buildQueueLength, int64(m.queue.QueueLength())) + + // Count images by status + metas, err := listAllTags(m.paths) + if err != nil { + return nil + } + statusCounts := make(map[string]int64) + for _, meta := range metas { + statusCounts[meta.Status]++ + } + for status, count := range statusCounts { + o.ObserveInt64(imagesTotal, count, + metric.WithAttributes(attribute.String("status", status))) + } + return nil + }, + buildQueueLength, + imagesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + buildDuration: buildDuration, + pullsTotal: pullsTotal, + }, nil +} + +// recordBuildMetrics records the build duration metric. 
+func (m *manager) recordBuildMetrics(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.buildDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordPullMetrics records the pull counter metric. +func (m *manager) recordPullMetrics(ctx context.Context, status string) { + if m.metrics == nil { + return + } + m.metrics.pullsTotal.Add(ctx, 1, + metric.WithAttributes(attribute.String("status", status))) +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 68e9b36c..a7050578 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,14 +4,12 @@ import ( "context" "fmt" "sync" - "time" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" ) @@ -39,15 +37,6 @@ type ResourceLimits struct { MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) } -// Metrics holds the metrics instruments for instance operations. -type Metrics struct { - createDuration metric.Float64Histogram - restoreDuration metric.Float64Histogram - standbyDuration metric.Float64Histogram - stateTransitions metric.Int64Counter - tracer trace.Tracer -} - type manager struct { paths *paths.Paths imageManager images.Manager @@ -85,105 +74,6 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste return m } -// newInstanceMetrics creates and registers all instance metrics. 
-func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { - createDuration, err := meter.Float64Histogram( - "hypeman_instances_create_duration_seconds", - metric.WithDescription("Time to create an instance"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - restoreDuration, err := meter.Float64Histogram( - "hypeman_instances_restore_duration_seconds", - metric.WithDescription("Time to restore an instance from standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - standbyDuration, err := meter.Float64Histogram( - "hypeman_instances_standby_duration_seconds", - metric.WithDescription("Time to put an instance in standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - stateTransitions, err := meter.Int64Counter( - "hypeman_instances_state_transitions_total", - metric.WithDescription("Total number of instance state transitions"), - ) - if err != nil { - return nil, err - } - - // Register observable gauge for instance counts by state - instancesTotal, err := meter.Int64ObservableGauge( - "hypeman_instances_total", - metric.WithDescription("Total number of instances by state"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - instances, err := m.listInstances(ctx) - if err != nil { - return nil - } - stateCounts := make(map[string]int64) - for _, inst := range instances { - stateCounts[string(inst.State)]++ - } - for state, count := range stateCounts { - o.ObserveInt64(instancesTotal, count, - metric.WithAttributes(attribute.String("state", state))) - } - return nil - }, - instancesTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - createDuration: createDuration, - restoreDuration: restoreDuration, - standbyDuration: standbyDuration, - stateTransitions: stateTransitions, - tracer: tracer, - }, nil -} - -// recordDuration records operation 
duration. -func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - histogram.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - -// recordStateTransition records a state transition. -func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { - if m.metrics == nil { - return - } - m.metrics.stateTransitions.Add(ctx, 1, - metric.WithAttributes( - attribute.String("from", fromState), - attribute.String("to", toState), - )) -} - // getInstanceLock returns or creates a lock for a specific instance func (m *manager) getInstanceLock(id string) *sync.RWMutex { lock, _ := m.instanceLocks.LoadOrStore(id, &sync.RWMutex{}) diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go new file mode 100644 index 00000000..3500aa1e --- /dev/null +++ b/lib/instances/metrics.go @@ -0,0 +1,118 @@ +package instances + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Metrics holds the metrics instruments for instance operations. +type Metrics struct { + createDuration metric.Float64Histogram + restoreDuration metric.Float64Histogram + standbyDuration metric.Float64Histogram + stateTransitions metric.Int64Counter + tracer trace.Tracer +} + +// newInstanceMetrics creates and registers all instance metrics. 
+func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_instances_create_duration_seconds", + metric.WithDescription("Time to create an instance"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + restoreDuration, err := meter.Float64Histogram( + "hypeman_instances_restore_duration_seconds", + metric.WithDescription("Time to restore an instance from standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + standbyDuration, err := meter.Float64Histogram( + "hypeman_instances_standby_duration_seconds", + metric.WithDescription("Time to put an instance in standby"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + stateTransitions, err := meter.Int64Counter( + "hypeman_instances_state_transitions_total", + metric.WithDescription("Total number of instance state transitions"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for instance counts by state + instancesTotal, err := meter.Int64ObservableGauge( + "hypeman_instances_total", + metric.WithDescription("Total number of instances by state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + instances, err := m.listInstances(ctx) + if err != nil { + return nil + } + stateCounts := make(map[string]int64) + for _, inst := range instances { + stateCounts[string(inst.State)]++ + } + for state, count := range stateCounts { + o.ObserveInt64(instancesTotal, count, + metric.WithAttributes(attribute.String("state", state))) + } + return nil + }, + instancesTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + restoreDuration: restoreDuration, + standbyDuration: standbyDuration, + stateTransitions: stateTransitions, + tracer: tracer, + }, nil +} + +// recordDuration records operation 
duration. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + histogram.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +} + +// recordStateTransition records a state transition. +func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { + if m.metrics == nil { + return + } + m.metrics.stateTransitions.Add(ctx, 1, + metric.WithAttributes( + attribute.String("from", fromState), + attribute.String("to", toState), + )) +} diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go index c77be233..cac161c9 100644 --- a/lib/middleware/otel.go +++ b/lib/middleware/otel.go @@ -1,8 +1,10 @@ package middleware import ( + "bufio" "fmt" "log/slog" + "net" "net/http" "time" @@ -48,11 +50,8 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() - // Wrap response writer to capture status code - wrapped := &responseWriter{ - ResponseWriter: w, - statusCode: http.StatusOK, - } + // Wrap response writer to capture status code and bytes + wrapped := wrapResponseWriter(w) // Process request next.ServeHTTP(wrapped, r) @@ -70,7 +69,7 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { attrs := []attribute.KeyValue{ attribute.String("method", r.Method), attribute.String("path", routePattern), - attribute.Int("status", wrapped.statusCode), + attribute.Int("status", wrapped.Status()), } m.requestsTotal.Add(r.Context(), 1, metric.WithAttributes(attrs...)) @@ -78,22 +77,6 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { }) } -// responseWriter wraps http.ResponseWriter to capture the status code. 
-type responseWriter struct { - http.ResponseWriter - statusCode int -} - -func (rw *responseWriter) WriteHeader(code int) { - rw.statusCode = code - rw.ResponseWriter.WriteHeader(code) -} - -// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. -func (rw *responseWriter) Unwrap() http.ResponseWriter { - return rw.ResponseWriter -} - // NoopHTTPMetrics returns a middleware that does nothing (for when OTel is disabled). func NoopHTTPMetrics() func(http.Handler) http.Handler { return func(next http.Handler) http.Handler { @@ -109,10 +92,7 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { start := time.Now() // Wrap response writer to capture status code and bytes - wrapped := &accessLogWriter{ - ResponseWriter: w, - statusCode: http.StatusOK, - } + wrapped := wrapResponseWriter(w) // Process request next.ServeHTTP(wrapped, r) @@ -126,11 +106,11 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { // Log with trace context from request context duration := time.Since(start) log.InfoContext(r.Context(), - fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.statusCode, wrapped.bytesWritten, duration.Milliseconds()), + fmt.Sprintf("%s %s %d %dB %dms", r.Method, routePattern, wrapped.Status(), wrapped.BytesWritten(), duration.Milliseconds()), "method", r.Method, "path", routePattern, - "status", wrapped.statusCode, - "bytes", wrapped.bytesWritten, + "status", wrapped.Status(), + "bytes", wrapped.BytesWritten(), "duration_ms", duration.Milliseconds(), "remote_addr", r.RemoteAddr, ) @@ -138,28 +118,6 @@ func AccessLogger(log *slog.Logger) func(http.Handler) http.Handler { } } -// accessLogWriter wraps http.ResponseWriter to capture status and bytes. 
-type accessLogWriter struct { - http.ResponseWriter - statusCode int - bytesWritten int -} - -func (w *accessLogWriter) WriteHeader(code int) { - w.statusCode = code - w.ResponseWriter.WriteHeader(code) -} - -func (w *accessLogWriter) Write(b []byte) (int, error) { - n, err := w.ResponseWriter.Write(b) - w.bytesWritten += n - return n, err -} - -func (w *accessLogWriter) Unwrap() http.ResponseWriter { - return w.ResponseWriter -} - // NewAccessLogger creates an access logger with OTel handler if available. func NewAccessLogger(otelHandler slog.Handler) *slog.Logger { cfg := logger.NewConfig() @@ -176,3 +134,69 @@ func InjectLogger(log *slog.Logger) func(http.Handler) http.Handler { }) } } + +// responseWriter wraps http.ResponseWriter to capture status code and bytes written. +// It also implements http.Flusher and http.Hijacker when the underlying writer supports them. +type responseWriter struct { + http.ResponseWriter + statusCode int + bytesWritten int + wroteHeader bool +} + +// wrapResponseWriter creates a new responseWriter wrapper. +func wrapResponseWriter(w http.ResponseWriter) *responseWriter { + return &responseWriter{ + ResponseWriter: w, + statusCode: http.StatusOK, + } +} + +// Status returns the HTTP status code. +func (rw *responseWriter) Status() int { + return rw.statusCode +} + +// BytesWritten returns the number of bytes written. +func (rw *responseWriter) BytesWritten() int { + return rw.bytesWritten +} + +// WriteHeader captures the status code before calling the underlying WriteHeader. +func (rw *responseWriter) WriteHeader(code int) { + if !rw.wroteHeader { + rw.statusCode = code + rw.wroteHeader = true + rw.ResponseWriter.WriteHeader(code) + } +} + +// Write captures bytes written and calls the underlying Write. 
+func (rw *responseWriter) Write(b []byte) (int, error) { + if !rw.wroteHeader { + rw.WriteHeader(http.StatusOK) + } + n, err := rw.ResponseWriter.Write(b) + rw.bytesWritten += n + return n, err +} + +// Unwrap provides access to the underlying ResponseWriter for http.ResponseController. +func (rw *responseWriter) Unwrap() http.ResponseWriter { + return rw.ResponseWriter +} + +// Flush implements http.Flusher. It delegates to the underlying writer if supported. +func (rw *responseWriter) Flush() { + if f, ok := rw.ResponseWriter.(http.Flusher); ok { + f.Flush() + } +} + +// Hijack implements http.Hijacker. It delegates to the underlying writer if supported. +func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if h, ok := rw.ResponseWriter.(http.Hijacker); ok { + return h.Hijack() + } + return nil, nil, fmt.Errorf("underlying ResponseWriter does not implement http.Hijacker") +} diff --git a/lib/network/manager.go b/lib/network/manager.go index 51e4b13d..a48d3341 100644 --- a/lib/network/manager.go +++ b/lib/network/manager.go @@ -9,7 +9,6 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -29,11 +28,6 @@ type Manager interface { NameExists(ctx context.Context, name string) (bool, error) } -// Metrics holds the metrics instruments for network operations. -type Metrics struct { - tapOperations metric.Int64Counter -} - // manager implements the Manager interface type manager struct { paths *paths.Paths @@ -61,54 +55,6 @@ func NewManager(p *paths.Paths, cfg *config.Config, meter metric.Meter) Manager return m } -// newNetworkMetrics creates and registers all network metrics. 
-func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - tapOperations, err := meter.Int64Counter( - "hypeman_network_tap_operations_total", - metric.WithDescription("Total number of TAP device operations"), - ) - if err != nil { - return nil, err - } - - // Register observable gauge for allocations - allocationsTotal, err := meter.Int64ObservableGauge( - "hypeman_network_allocations_total", - metric.WithDescription("Total number of active network allocations"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - allocs, err := m.ListAllocations(ctx) - if err != nil { - return nil - } - o.ObserveInt64(allocationsTotal, int64(len(allocs))) - return nil - }, - allocationsTotal, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - tapOperations: tapOperations, - }, nil -} - -// recordTAPOperation records a TAP device operation. -func (m *manager) recordTAPOperation(ctx context.Context, operation string) { - if m.metrics == nil { - return - } - m.metrics.tapOperations.Add(ctx, 1, - metric.WithAttributes(attribute.String("operation", operation))) -} - // Initialize initializes the network manager and creates default network. // runningInstanceIDs should contain IDs of instances currently running (have active VMM). func (m *manager) Initialize(ctx context.Context, runningInstanceIDs []string) error { diff --git a/lib/network/metrics.go b/lib/network/metrics.go new file mode 100644 index 00000000..48cbd1c0 --- /dev/null +++ b/lib/network/metrics.go @@ -0,0 +1,61 @@ +package network + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for network operations. +type Metrics struct { + tapOperations metric.Int64Counter +} + +// newNetworkMetrics creates and registers all network metrics. 
+func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + tapOperations, err := meter.Int64Counter( + "hypeman_network_tap_operations_total", + metric.WithDescription("Total number of TAP device operations"), + ) + if err != nil { + return nil, err + } + + // Register observable gauge for allocations + allocationsTotal, err := meter.Int64ObservableGauge( + "hypeman_network_allocations_total", + metric.WithDescription("Total number of active network allocations"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + allocs, err := m.ListAllocations(ctx) + if err != nil { + return nil + } + o.ObserveInt64(allocationsTotal, int64(len(allocs))) + return nil + }, + allocationsTotal, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + tapOperations: tapOperations, + }, nil +} + +// recordTAPOperation records a TAP device operation. +func (m *manager) recordTAPOperation(ctx context.Context, operation string) { + if m.metrics == nil { + return + } + m.metrics.tapOperations.Add(ctx, 1, + metric.WithAttributes(attribute.String("operation", operation))) +} diff --git a/lib/otel/metrics.go b/lib/otel/metrics.go deleted file mode 100644 index 6532703d..00000000 --- a/lib/otel/metrics.go +++ /dev/null @@ -1,317 +0,0 @@ -package otel - -import ( - "go.opentelemetry.io/otel/metric" -) - -// ImageMetrics holds metrics for the image manager. -type ImageMetrics struct { - BuildQueueLength metric.Int64ObservableGauge - BuildDuration metric.Float64Histogram - ImagesTotal metric.Int64ObservableGauge - PullsTotal metric.Int64Counter -} - -// NewImageMetrics creates metrics for the image manager. 
-func NewImageMetrics(meter metric.Meter) (*ImageMetrics, error) { - buildQueueLength, err := meter.Int64ObservableGauge( - "hypeman_images_build_queue_length", - metric.WithDescription("Current number of images in the build queue"), - ) - if err != nil { - return nil, err - } - - buildDuration, err := meter.Float64Histogram( - "hypeman_images_build_duration_seconds", - metric.WithDescription("Time to build an image"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - imagesTotal, err := meter.Int64ObservableGauge( - "hypeman_images_total", - metric.WithDescription("Total number of cached images"), - ) - if err != nil { - return nil, err - } - - pullsTotal, err := meter.Int64Counter( - "hypeman_images_pulls_total", - metric.WithDescription("Total number of image pulls from registries"), - ) - if err != nil { - return nil, err - } - - return &ImageMetrics{ - BuildQueueLength: buildQueueLength, - BuildDuration: buildDuration, - ImagesTotal: imagesTotal, - PullsTotal: pullsTotal, - }, nil -} - -// InstanceMetrics holds metrics for the instance manager. -type InstanceMetrics struct { - InstancesTotal metric.Int64ObservableGauge - CreateDuration metric.Float64Histogram - RestoreDuration metric.Float64Histogram - StandbyDuration metric.Float64Histogram - StateTransitions metric.Int64Counter -} - -// NewInstanceMetrics creates metrics for the instance manager. 
-func NewInstanceMetrics(meter metric.Meter) (*InstanceMetrics, error) { - instancesTotal, err := meter.Int64ObservableGauge( - "hypeman_instances_total", - metric.WithDescription("Total number of instances by state"), - ) - if err != nil { - return nil, err - } - - createDuration, err := meter.Float64Histogram( - "hypeman_instances_create_duration_seconds", - metric.WithDescription("Time to create an instance"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - restoreDuration, err := meter.Float64Histogram( - "hypeman_instances_restore_duration_seconds", - metric.WithDescription("Time to restore an instance from standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - standbyDuration, err := meter.Float64Histogram( - "hypeman_instances_standby_duration_seconds", - metric.WithDescription("Time to put an instance in standby"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - stateTransitions, err := meter.Int64Counter( - "hypeman_instances_state_transitions_total", - metric.WithDescription("Total number of instance state transitions"), - ) - if err != nil { - return nil, err - } - - return &InstanceMetrics{ - InstancesTotal: instancesTotal, - CreateDuration: createDuration, - RestoreDuration: restoreDuration, - StandbyDuration: standbyDuration, - StateTransitions: stateTransitions, - }, nil -} - -// NetworkMetrics holds metrics for the network manager. -type NetworkMetrics struct { - AllocationsTotal metric.Int64ObservableGauge - TapOperations metric.Int64Counter -} - -// NewNetworkMetrics creates metrics for the network manager. 
-func NewNetworkMetrics(meter metric.Meter) (*NetworkMetrics, error) { - allocationsTotal, err := meter.Int64ObservableGauge( - "hypeman_network_allocations_total", - metric.WithDescription("Total number of active network allocations"), - ) - if err != nil { - return nil, err - } - - tapOperations, err := meter.Int64Counter( - "hypeman_network_tap_operations_total", - metric.WithDescription("Total number of TAP device operations"), - ) - if err != nil { - return nil, err - } - - return &NetworkMetrics{ - AllocationsTotal: allocationsTotal, - TapOperations: tapOperations, - }, nil -} - -// VolumeMetrics holds metrics for the volume manager. -type VolumeMetrics struct { - VolumesTotal metric.Int64ObservableGauge - AllocatedBytes metric.Int64ObservableGauge - UsedBytes metric.Int64ObservableGauge - CreateDuration metric.Float64Histogram -} - -// NewVolumeMetrics creates metrics for the volume manager. -func NewVolumeMetrics(meter metric.Meter) (*VolumeMetrics, error) { - volumesTotal, err := meter.Int64ObservableGauge( - "hypeman_volumes_total", - metric.WithDescription("Total number of volumes"), - ) - if err != nil { - return nil, err - } - - allocatedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_allocated_bytes", - metric.WithDescription("Total allocated/provisioned volume size in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - usedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_used_bytes", - metric.WithDescription("Actual disk space consumed by volumes in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - createDuration, err := meter.Float64Histogram( - "hypeman_volumes_create_duration_seconds", - metric.WithDescription("Time to create a volume"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - return &VolumeMetrics{ - VolumesTotal: volumesTotal, - AllocatedBytes: allocatedBytes, - UsedBytes: usedBytes, - CreateDuration: createDuration, - }, nil -} 
- -// ExecMetrics holds metrics for the exec client. -type ExecMetrics struct { - SessionsTotal metric.Int64Counter - Duration metric.Float64Histogram - BytesSentTotal metric.Int64Counter - BytesReceivedTotal metric.Int64Counter -} - -// NewExecMetrics creates metrics for the exec client. -func NewExecMetrics(meter metric.Meter) (*ExecMetrics, error) { - sessionsTotal, err := meter.Int64Counter( - "hypeman_exec_sessions_total", - metric.WithDescription("Total number of exec sessions"), - ) - if err != nil { - return nil, err - } - - duration, err := meter.Float64Histogram( - "hypeman_exec_duration_seconds", - metric.WithDescription("Exec command duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - bytesSentTotal, err := meter.Int64Counter( - "hypeman_exec_bytes_sent_total", - metric.WithDescription("Total bytes sent to guest (stdin)"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - bytesReceivedTotal, err := meter.Int64Counter( - "hypeman_exec_bytes_received_total", - metric.WithDescription("Total bytes received from guest (stdout+stderr)"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - return &ExecMetrics{ - SessionsTotal: sessionsTotal, - Duration: duration, - BytesSentTotal: bytesSentTotal, - BytesReceivedTotal: bytesReceivedTotal, - }, nil -} - -// VMMMetrics holds metrics for the VMM client. -type VMMMetrics struct { - APIDuration metric.Float64Histogram - APIErrorsTotal metric.Int64Counter -} - -// NewVMMMetrics creates metrics for the VMM client. 
-func NewVMMMetrics(meter metric.Meter) (*VMMMetrics, error) { - apiDuration, err := meter.Float64Histogram( - "hypeman_vmm_api_duration_seconds", - metric.WithDescription("Cloud Hypervisor API call duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - apiErrorsTotal, err := meter.Int64Counter( - "hypeman_vmm_api_errors_total", - metric.WithDescription("Total number of Cloud Hypervisor API errors"), - ) - if err != nil { - return nil, err - } - - return &VMMMetrics{ - APIDuration: apiDuration, - APIErrorsTotal: apiErrorsTotal, - }, nil -} - -// HTTPMetrics holds metrics for HTTP middleware. -type HTTPMetrics struct { - RequestsTotal metric.Int64Counter - RequestDuration metric.Float64Histogram -} - -// NewHTTPMetrics creates metrics for HTTP middleware. -func NewHTTPMetrics(meter metric.Meter) (*HTTPMetrics, error) { - requestsTotal, err := meter.Int64Counter( - "hypeman_http_requests_total", - metric.WithDescription("Total number of HTTP requests"), - ) - if err != nil { - return nil, err - } - - requestDuration, err := meter.Float64Histogram( - "hypeman_http_request_duration_seconds", - metric.WithDescription("HTTP request duration"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - return &HTTPMetrics{ - RequestsTotal: requestsTotal, - RequestDuration: requestDuration, - }, nil -} diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index ac5b375a..d3a3dfc8 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -6,13 +6,11 @@ import ( "io" "os" "sync" - "syscall" "time" "github.com/nrednav/cuid2" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -37,11 +35,6 @@ type Manager interface { GetVolumePath(id string) string } -// Metrics holds the metrics instruments for volume operations. 
-type Metrics struct { - createDuration metric.Float64Histogram -} - type manager struct { paths *paths.Paths maxTotalVolumeStorage int64 // Maximum total volume storage in bytes (0 = unlimited) @@ -70,95 +63,6 @@ func NewManager(p *paths.Paths, maxTotalVolumeStorage int64, meter metric.Meter) return m } -// newVolumeMetrics creates and registers all volume metrics. -func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { - createDuration, err := meter.Float64Histogram( - "hypeman_volumes_create_duration_seconds", - metric.WithDescription("Time to create a volume"), - metric.WithUnit("s"), - ) - if err != nil { - return nil, err - } - - // Register observable gauges - volumesTotal, err := meter.Int64ObservableGauge( - "hypeman_volumes_total", - metric.WithDescription("Total number of volumes"), - ) - if err != nil { - return nil, err - } - - allocatedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_allocated_bytes", - metric.WithDescription("Total allocated/provisioned volume size in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - usedBytes, err := meter.Int64ObservableGauge( - "hypeman_volumes_used_bytes", - metric.WithDescription("Actual disk space consumed by volumes in bytes"), - metric.WithUnit("By"), - ) - if err != nil { - return nil, err - } - - _, err = meter.RegisterCallback( - func(ctx context.Context, o metric.Observer) error { - volumes, err := m.ListVolumes(ctx) - if err != nil { - return nil - } - - o.ObserveInt64(volumesTotal, int64(len(volumes))) - - var totalAllocated, totalUsed int64 - for _, vol := range volumes { - // Allocated = provisioned size in GB - totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 - - // Used = actual disk blocks consumed (for sparse files) - diskPath := m.paths.VolumeData(vol.Id) - if stat, err := os.Stat(diskPath); err == nil { - // Get actual blocks used via syscall - if sysStat, ok := stat.Sys().(*syscall.Stat_t); ok { - totalUsed += sysStat.Blocks 
* 512 // Blocks are in 512-byte units - } - } - } - - o.ObserveInt64(allocatedBytes, totalAllocated) - o.ObserveInt64(usedBytes, totalUsed) - return nil - }, - volumesTotal, - allocatedBytes, - usedBytes, - ) - if err != nil { - return nil, err - } - - return &Metrics{ - createDuration: createDuration, - }, nil -} - -// recordCreateDuration records the volume creation duration. -func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { - if m.metrics == nil { - return - } - duration := time.Since(start).Seconds() - m.metrics.createDuration.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) -} - // getVolumeLock returns or creates a lock for a specific volume func (m *manager) getVolumeLock(id string) *sync.RWMutex { lock, _ := m.volumeLocks.LoadOrStore(id, &sync.RWMutex{}) diff --git a/lib/volumes/metrics.go b/lib/volumes/metrics.go new file mode 100644 index 00000000..ad8db491 --- /dev/null +++ b/lib/volumes/metrics.go @@ -0,0 +1,105 @@ +package volumes + +import ( + "context" + "os" + "syscall" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the metrics instruments for volume operations. +type Metrics struct { + createDuration metric.Float64Histogram +} + +// newVolumeMetrics creates and registers all volume metrics. 
+func newVolumeMetrics(meter metric.Meter, m *manager) (*Metrics, error) { + createDuration, err := meter.Float64Histogram( + "hypeman_volumes_create_duration_seconds", + metric.WithDescription("Time to create a volume"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + // Register observable gauges + volumesTotal, err := meter.Int64ObservableGauge( + "hypeman_volumes_total", + metric.WithDescription("Total number of volumes"), + ) + if err != nil { + return nil, err + } + + allocatedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_allocated_bytes", + metric.WithDescription("Total allocated/provisioned volume size in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + usedBytes, err := meter.Int64ObservableGauge( + "hypeman_volumes_used_bytes", + metric.WithDescription("Actual disk space consumed by volumes in bytes"), + metric.WithUnit("By"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + volumes, err := m.ListVolumes(ctx) + if err != nil { + return nil + } + + o.ObserveInt64(volumesTotal, int64(len(volumes))) + + var totalAllocated, totalUsed int64 + for _, vol := range volumes { + // Allocated = provisioned size in GB + totalAllocated += int64(vol.SizeGb) * 1024 * 1024 * 1024 + + // Used = actual disk blocks consumed (for sparse files) + diskPath := m.paths.VolumeData(vol.Id) + if stat, err := os.Stat(diskPath); err == nil { + // Get actual blocks used via syscall + if sysStat, ok := stat.Sys().(*syscall.Stat_t); ok { + totalUsed += sysStat.Blocks * 512 // Blocks are in 512-byte units + } + } + } + + o.ObserveInt64(allocatedBytes, totalAllocated) + o.ObserveInt64(usedBytes, totalUsed) + return nil + }, + volumesTotal, + allocatedBytes, + usedBytes, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + createDuration: createDuration, + }, nil +} + +// recordCreateDuration records the volume 
creation duration. +func (m *manager) recordCreateDuration(ctx context.Context, start time.Time, status string) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + m.metrics.createDuration.Record(ctx, duration, + metric.WithAttributes(attribute.String("status", status))) +}