Skip to content

Commit a13253c

Browse files
committed
envd: collect and record heap metrics
This change makes the kubernetes orchestrator collect the new heap metrics reported by clusterd processes and report them to the controller, where they are added to the `mz_cluster_replica_metrics_history` relation.
1 parent faaaf56 commit a13253c

File tree

8 files changed

+84
-60
lines changed

8 files changed

+84
-60
lines changed

src/catalog/src/builtin.rs

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4945,6 +4945,8 @@ pub static MZ_CLUSTER_REPLICA_METRICS: LazyLock<BuiltinView> = LazyLock::new(||
49454945
.with_column("cpu_nano_cores", SqlScalarType::UInt64.nullable(true))
49464946
.with_column("memory_bytes", SqlScalarType::UInt64.nullable(true))
49474947
.with_column("disk_bytes", SqlScalarType::UInt64.nullable(true))
4948+
.with_column("heap_bytes", SqlScalarType::UInt64.nullable(true))
4949+
.with_column("heap_limit", SqlScalarType::UInt64.nullable(true))
49484950
.with_key(vec![0, 1])
49494951
.finish(),
49504952
column_comments: BTreeMap::from_iter([
@@ -4964,7 +4966,9 @@ SELECT
49644966
process_id,
49654967
cpu_nano_cores,
49664968
memory_bytes,
4967-
disk_bytes
4969+
disk_bytes,
4970+
heap_bytes,
4971+
heap_limit
49684972
FROM mz_internal.mz_cluster_replica_metrics_history
49694973
JOIN mz_cluster_replicas r ON r.id = replica_id
49704974
ORDER BY replica_id, process_id, occurred_at DESC",
@@ -8863,6 +8867,7 @@ pub static MZ_CLUSTER_REPLICA_UTILIZATION: LazyLock<BuiltinView> = LazyLock::new
88638867
.with_column("cpu_percent", SqlScalarType::Float64.nullable(true))
88648868
.with_column("memory_percent", SqlScalarType::Float64.nullable(true))
88658869
.with_column("disk_percent", SqlScalarType::Float64.nullable(true))
8870+
.with_column("heap_percent", SqlScalarType::Float64.nullable(true))
88668871
.finish(),
88678872
column_comments: BTreeMap::from_iter([
88688873
("replica_id", "The ID of a cluster replica."),
@@ -8886,7 +8891,8 @@ SELECT
88868891
m.process_id,
88878892
m.cpu_nano_cores::float8 / NULLIF(s.cpu_nano_cores, 0) * 100 AS cpu_percent,
88888893
m.memory_bytes::float8 / NULLIF(s.memory_bytes, 0) * 100 AS memory_percent,
8889-
m.disk_bytes::float8 / NULLIF(s.disk_bytes, 0) * 100 AS disk_percent
8894+
m.disk_bytes::float8 / NULLIF(s.disk_bytes, 0) * 100 AS disk_percent,
8895+
m.heap_bytes::float8 / NULLIF(m.heap_limit, 0) * 100 AS heap_percent
88908896
FROM
88918897
mz_catalog.mz_cluster_replicas AS r
88928898
JOIN mz_catalog.mz_cluster_replica_sizes AS s ON r.size = s.size
@@ -8905,6 +8911,7 @@ pub static MZ_CLUSTER_REPLICA_UTILIZATION_HISTORY: LazyLock<BuiltinView> =
89058911
.with_column("cpu_percent", SqlScalarType::Float64.nullable(true))
89068912
.with_column("memory_percent", SqlScalarType::Float64.nullable(true))
89078913
.with_column("disk_percent", SqlScalarType::Float64.nullable(true))
8914+
.with_column("heap_percent", SqlScalarType::Float64.nullable(true))
89088915
.with_column(
89098916
"occurred_at",
89108917
SqlScalarType::TimestampTz { precision: None }.nullable(false),
@@ -8937,6 +8944,7 @@ SELECT
89378944
m.cpu_nano_cores::float8 / NULLIF(s.cpu_nano_cores, 0) * 100 AS cpu_percent,
89388945
m.memory_bytes::float8 / NULLIF(s.memory_bytes, 0) * 100 AS memory_percent,
89398946
m.disk_bytes::float8 / NULLIF(s.disk_bytes, 0) * 100 AS disk_percent,
8947+
m.heap_bytes::float8 / NULLIF(m.heap_limit, 0) * 100 AS heap_percent,
89408948
m.occurred_at
89418949
FROM
89428950
mz_catalog.mz_cluster_replicas AS r

src/controller/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -591,8 +591,10 @@ where
591591
Datum::UInt64(u64::cast_from(process_id)),
592592
m.cpu_nano_cores.into(),
593593
m.memory_bytes.into(),
594-
m.disk_usage_bytes.into(),
594+
m.disk_bytes.into(),
595595
Datum::TimestampTz(now_tz),
596+
m.heap_bytes.into(),
597+
m.heap_limit.into(),
596598
]);
597599
(row.clone(), mz_repr::Diff::ONE)
598600
})

src/orchestrator-kubernetes/src/lib.rs

Lines changed: 57 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ use mz_ore::task::AbortOnDropHandle;
5454
use serde::Deserialize;
5555
use sha2::{Digest, Sha256};
5656
use tokio::sync::{mpsc, oneshot};
57-
use tracing::{info, warn};
57+
use tracing::{error, info, warn};
5858

5959
pub mod cloud_resource_controller;
6060
pub mod secrets;
@@ -1423,6 +1423,15 @@ impl OrchestratorWorker {
14231423
.collect();
14241424
}
14251425

1426+
/// Usage metrics reported by clusterd processes.
1427+
#[derive(Deserialize)]
1428+
pub(crate) struct ClusterdUsage {
1429+
disk_bytes: Option<u64>,
1430+
memory_bytes: Option<u64>,
1431+
swap_bytes: Option<u64>,
1432+
heap_limit: Option<u64>,
1433+
}
1434+
14261435
/// Get metrics for a particular service and process, converting them into a sane (i.e., numeric) format.
14271436
///
14281437
/// Note that we want to keep going even if a lookup fails for whatever reason,
@@ -1435,12 +1444,13 @@ impl OrchestratorWorker {
14351444
) -> ServiceProcessMetrics {
14361445
let name = format!("{service_name}-{i}");
14371446

1438-
let disk_usage_fut = get_disk_usage(self_, service_name, i);
1439-
let (metrics, disk_usage) =
1440-
match futures::future::join(self_.metrics_api.get(&name), disk_usage_fut).await {
1441-
(Ok(metrics), Ok(disk_usage)) => (metrics, disk_usage),
1447+
let clusterd_usage_fut = get_clusterd_usage(self_, service_name, i);
1448+
let (metrics, clusterd_usage) =
1449+
match futures::future::join(self_.metrics_api.get(&name), clusterd_usage_fut).await
1450+
{
1451+
(Ok(metrics), Ok(clusterd_usage)) => (metrics, Some(clusterd_usage)),
14421452
(Ok(metrics), Err(e)) => {
1443-
warn!("Failed to fetch disk usage for {name}: {e}");
1453+
warn!("Failed to fetch clusterd usage for {name}: {e}");
14441454
(metrics, None)
14451455
}
14461456
(Err(e), _) => {
@@ -1461,56 +1471,58 @@ impl OrchestratorWorker {
14611471
return ServiceProcessMetrics::default();
14621472
};
14631473

1464-
let cpu = match parse_k8s_quantity(cpu_str) {
1474+
let mut process_metrics = ServiceProcessMetrics::default();
1475+
1476+
match parse_k8s_quantity(cpu_str) {
14651477
Ok(q) => match q.try_to_integer(-9, true) {
1466-
Some(i) => Some(i),
1467-
None => {
1468-
tracing::error!("CPU value {q:? }out of range");
1469-
None
1470-
}
1478+
Some(nano_cores) => process_metrics.cpu_nano_cores = Some(nano_cores),
1479+
None => error!("CPU value {q:?} out of range"),
14711480
},
1472-
Err(e) => {
1473-
tracing::error!("Failed to parse CPU value {cpu_str}: {e}");
1474-
None
1475-
}
1476-
};
1477-
let memory = match parse_k8s_quantity(mem_str) {
1481+
Err(e) => error!("failed to parse CPU value {cpu_str}: {e}"),
1482+
}
1483+
match parse_k8s_quantity(mem_str) {
14781484
Ok(q) => match q.try_to_integer(0, false) {
1479-
Some(i) => Some(i),
1480-
None => {
1481-
tracing::error!("Memory value {q:?} out of range");
1482-
None
1483-
}
1485+
Some(mem) => process_metrics.memory_bytes = Some(mem),
1486+
None => error!("memory value {q:?} out of range"),
14841487
},
1485-
Err(e) => {
1486-
tracing::error!("Failed to parse memory value {mem_str}: {e}");
1487-
None
1488-
}
1489-
};
1488+
Err(e) => error!("failed to parse memory value {mem_str}: {e}"),
1489+
}
1490+
1491+
if let Some(usage) = clusterd_usage {
1492+
// clusterd may report disk usage as either `disk_bytes`, or `swap_bytes`, or both.
1493+
//
1494+
// For now the Console expects the swap size to be reported in `disk_bytes`.
1495+
// Once the Console has been ported to use `heap_bytes`/`heap_limit`, we can
1496+
// simplify things by setting `process_metrics.disk_bytes = usage.disk_bytes`.
1497+
process_metrics.disk_bytes = match (usage.disk_bytes, usage.swap_bytes) {
1498+
(Some(disk), Some(swap)) => Some(disk + swap),
1499+
(disk, swap) => disk.or(swap),
1500+
};
1501+
1502+
// clusterd may report heap usage as `memory_bytes` and optionally `swap_bytes`.
1503+
// If no `memory_bytes` is reported, we can't know the heap usage.
1504+
process_metrics.heap_bytes = match (usage.memory_bytes, usage.swap_bytes) {
1505+
(Some(memory), Some(swap)) => Some(memory + swap),
1506+
(Some(memory), None) => Some(memory),
1507+
(None, _) => None,
1508+
};
14901509

1491-
ServiceProcessMetrics {
1492-
cpu_nano_cores: cpu,
1493-
memory_bytes: memory,
1494-
disk_usage_bytes: disk_usage,
1510+
process_metrics.heap_limit = usage.heap_limit;
14951511
}
1512+
1513+
process_metrics
14961514
}
14971515

1498-
/// Get the current disk usage for a particular service and process.
1516+
/// Get the current usage metrics exposed by a clusterd process.
14991517
///
1500-
/// Disk usage is collected by connecting to a metrics endpoint exposed by the process. The
1501-
/// endpoint is assumed to be reachable at the 'internal-http' under the HTTP path
1518+
/// Usage metrics are collected by connecting to a metrics endpoint exposed by the process.
1519+
/// The endpoint is assumed to be reachable at the 'internal-http' under the HTTP path
15021520
/// `/api/usage-metrics`.
1503-
async fn get_disk_usage(
1521+
async fn get_clusterd_usage(
15041522
self_: &OrchestratorWorker,
15051523
service_name: &str,
15061524
i: usize,
1507-
) -> anyhow::Result<Option<u64>> {
1508-
#[derive(Deserialize)]
1509-
pub(crate) struct Usage {
1510-
disk_bytes: Option<u64>,
1511-
swap_bytes: Option<u64>,
1512-
}
1513-
1525+
) -> anyhow::Result<ClusterdUsage> {
15141526
let service = self_
15151527
.service_api
15161528
.get(service_name)
@@ -1542,17 +1554,9 @@ impl OrchestratorWorker {
15421554
.build()
15431555
.context("error building HTTP client")?;
15441556
let resp = http_client.get(metrics_url).send().await?;
1545-
let Usage {
1546-
disk_bytes,
1547-
swap_bytes,
1548-
} = resp.json().await?;
1557+
let usage = resp.json().await?;
15491558

1550-
let bytes = if let (Some(disk), Some(swap)) = (disk_bytes, swap_bytes) {
1551-
Some(disk + swap)
1552-
} else {
1553-
disk_bytes.or(swap_bytes)
1554-
};
1555-
Ok(bytes)
1559+
Ok(usage)
15561560
}
15571561

15581562
let ret =

src/orchestrator-process/src/lib.rs

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -553,8 +553,10 @@ impl OrchestratorWorker {
553553
metrics.push(ServiceProcessMetrics {
554554
cpu_nano_cores,
555555
memory_bytes,
556-
// Process orchestrator does not support this right now.
557-
disk_usage_bytes: None,
556+
// Process orchestrator does not support the remaining fields right now.
557+
disk_bytes: None,
558+
heap_bytes: None,
559+
heap_limit: None,
558560
});
559561
}
560562
Ok(metrics)

src/orchestrator/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -145,7 +145,9 @@ pub trait Service: fmt::Debug + Send + Sync {
145145
pub struct ServiceProcessMetrics {
146146
pub cpu_nano_cores: Option<u64>,
147147
pub memory_bytes: Option<u64>,
148-
pub disk_usage_bytes: Option<u64>,
148+
pub disk_bytes: Option<u64>,
149+
pub heap_bytes: Option<u64>,
150+
pub heap_limit: Option<u64>,
149151
}
150152

151153
/// A simple language for describing assertions about a label's existence and value.

src/storage-client/src/healthcheck.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,8 @@ pub static REPLICA_METRICS_HISTORY_DESC: LazyLock<RelationDesc> = LazyLock::new(
169169
"occurred_at",
170170
SqlScalarType::TimestampTz { precision: None }.nullable(false),
171171
)
172+
.with_column("heap_bytes", SqlScalarType::UInt64.nullable(true))
173+
.with_column("heap_limit", SqlScalarType::UInt64.nullable(true))
172174
.finish()
173175
});
174176

test/sqllogictest/mz_catalog_server_index_accounting.slt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -224,11 +224,15 @@ mz_cluster_replica_history replica_name
224224
mz_cluster_replica_history size
225225
mz_cluster_replica_metrics cpu_nano_cores
226226
mz_cluster_replica_metrics disk_bytes
227+
mz_cluster_replica_metrics heap_bytes
228+
mz_cluster_replica_metrics heap_limit
227229
mz_cluster_replica_metrics memory_bytes
228230
mz_cluster_replica_metrics process_id
229231
mz_cluster_replica_metrics replica_id
230232
mz_cluster_replica_metrics_history cpu_nano_cores
231233
mz_cluster_replica_metrics_history disk_bytes
234+
mz_cluster_replica_metrics_history heap_bytes
235+
mz_cluster_replica_metrics_history heap_limit
232236
mz_cluster_replica_metrics_history memory_bytes
233237
mz_cluster_replica_metrics_history occurred_at
234238
mz_cluster_replica_metrics_history process_id

test/sqllogictest/system-cluster.slt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -410,7 +410,7 @@ ORDER BY r.id;
410410
----
411411
Explained Query:
412412
Finish order_by=[#0{id} asc nulls_last] output=[#0..=#5]
413-
Project (#0{id}..=#3{size}, #5{name}, #29)
413+
Project (#0{id}..=#3{size}, #5{name}, #31)
414414
Map (((uint8_to_double(#27{memory_bytes}) / uint8_to_double(case when (0 = uint8_to_numeric(#21{memory_bytes})) then null else #21{memory_bytes} end)) * 100))
415415
Join on=(#0{id} = #15{id} = #24{replica_id} AND #2{cluster_id} = #4{id} AND #16{size} = #17{size}) type=delta
416416
ArrangeBy keys=[[#0{id}], [#2{cluster_id}]]

0 commit comments

Comments (0)