@@ -54,7 +54,7 @@ use mz_ore::task::AbortOnDropHandle;
5454use serde:: Deserialize ;
5555use sha2:: { Digest , Sha256 } ;
5656use tokio:: sync:: { mpsc, oneshot} ;
57- use tracing:: { info, warn} ;
57+ use tracing:: { error , info, warn} ;
5858
5959pub mod cloud_resource_controller;
6060pub mod secrets;
@@ -1423,6 +1423,15 @@ impl OrchestratorWorker {
14231423 . collect ( ) ;
14241424 }
14251425
1426+ /// Usage metrics reported by clusterd processes, deserialized from the JSON body of the `/api/usage-metrics` endpoint; every field is optional.
1427+ #[ derive( Deserialize ) ]
1428+ pub ( crate ) struct ClusterdUsage {
1429+ disk_bytes : Option < u64 > , // Disk usage; summed with `swap_bytes` to form the reported `disk_bytes` metric.
1430+ memory_bytes : Option < u64 > , // Basis of the reported `heap_bytes` metric; without it no heap usage is reported.
1431+ swap_bytes : Option < u64 > , // Swap usage; added to both the reported `disk_bytes` and `heap_bytes` metrics when present.
1432+ heap_limit : Option < u64 > , // Copied unchanged into the reported `heap_limit` metric.
1433+ }
1434+
14261435 /// Get metrics for a particular service and process, converting them into a sane (i.e., numeric) format.
14271436 ///
14281437 /// Note that we want to keep going even if a lookup fails for whatever reason,
@@ -1435,12 +1444,13 @@ impl OrchestratorWorker {
14351444 ) -> ServiceProcessMetrics {
14361445 let name = format ! ( "{service_name}-{i}" ) ;
14371446
1438- let disk_usage_fut = get_disk_usage ( self_, service_name, i) ;
1439- let ( metrics, disk_usage) =
1440- match futures:: future:: join ( self_. metrics_api . get ( & name) , disk_usage_fut) . await {
1441- ( Ok ( metrics) , Ok ( disk_usage) ) => ( metrics, disk_usage) ,
1447+ let clusterd_usage_fut = get_clusterd_usage ( self_, service_name, i) ;
1448+ let ( metrics, clusterd_usage) =
1449+ match futures:: future:: join ( self_. metrics_api . get ( & name) , clusterd_usage_fut) . await
1450+ {
1451+ ( Ok ( metrics) , Ok ( clusterd_usage) ) => ( metrics, Some ( clusterd_usage) ) ,
14421452 ( Ok ( metrics) , Err ( e) ) => {
1443- warn ! ( "Failed to fetch disk usage for {name}: {e}" ) ;
1453+ warn ! ( "Failed to fetch clusterd usage for {name}: {e}" ) ;
14441454 ( metrics, None )
14451455 }
14461456 ( Err ( e) , _) => {
@@ -1461,56 +1471,58 @@ impl OrchestratorWorker {
14611471 return ServiceProcessMetrics :: default ( ) ;
14621472 } ;
14631473
1464- let cpu = match parse_k8s_quantity ( cpu_str) {
1474+ let mut process_metrics = ServiceProcessMetrics :: default ( ) ;
1475+
1476+ match parse_k8s_quantity ( cpu_str) {
14651477 Ok ( q) => match q. try_to_integer ( -9 , true ) {
1466- Some ( i) => Some ( i) ,
1467- None => {
1468- tracing:: error!( "CPU value {q:? }out of range" ) ;
1469- None
1470- }
1478+ Some ( nano_cores) => process_metrics. cpu_nano_cores = Some ( nano_cores) ,
1479+ None => error ! ( "CPU value {q:?} out of range" ) ,
14711480 } ,
1472- Err ( e) => {
1473- tracing:: error!( "Failed to parse CPU value {cpu_str}: {e}" ) ;
1474- None
1475- }
1476- } ;
1477- let memory = match parse_k8s_quantity ( mem_str) {
1481+ Err ( e) => error ! ( "failed to parse CPU value {cpu_str}: {e}" ) ,
1482+ }
1483+ match parse_k8s_quantity ( mem_str) {
14781484 Ok ( q) => match q. try_to_integer ( 0 , false ) {
1479- Some ( i) => Some ( i) ,
1480- None => {
1481- tracing:: error!( "Memory value {q:?} out of range" ) ;
1482- None
1483- }
1485+ Some ( mem) => process_metrics. memory_bytes = Some ( mem) ,
1486+ None => error ! ( "memory value {q:?} out of range" ) ,
14841487 } ,
1485- Err ( e) => {
1486- tracing:: error!( "Failed to parse memory value {mem_str}: {e}" ) ;
1487- None
1488- }
1489- } ;
1488+ Err ( e) => error ! ( "failed to parse memory value {mem_str}: {e}" ) ,
1489+ }
1490+
1491+ if let Some ( usage) = clusterd_usage {
1492+ // clusterd may report disk usage as either `disk_bytes`, or `swap_bytes`, or both.
1493+ //
1494+ // For now the Console expects the swap size to be reported in `disk_bytes`.
1495+ // Once the Console has been ported to use `heap_bytes`/`heap_limit`, we can
1496+ // simplify things by setting `process_metrics.disk_bytes = usage.disk_bytes`.
1497+ process_metrics. disk_bytes = match ( usage. disk_bytes , usage. swap_bytes ) {
1498+ ( Some ( disk) , Some ( swap) ) => Some ( disk + swap) ,
1499+ ( disk, swap) => disk. or ( swap) ,
1500+ } ;
1501+
1502+ // clusterd may report heap usage as `memory_bytes` and optionally `swap_bytes`.
1503+ // If no `memory_bytes` is reported, we can't know the heap usage.
1504+ process_metrics. heap_bytes = match ( usage. memory_bytes , usage. swap_bytes ) {
1505+ ( Some ( memory) , Some ( swap) ) => Some ( memory + swap) ,
1506+ ( Some ( memory) , None ) => Some ( memory) ,
1507+ ( None , _) => None ,
1508+ } ;
14901509
1491- ServiceProcessMetrics {
1492- cpu_nano_cores : cpu,
1493- memory_bytes : memory,
1494- disk_usage_bytes : disk_usage,
1510+ process_metrics. heap_limit = usage. heap_limit ;
14951511 }
1512+
1513+ process_metrics
14961514 }
14971515
1498- /// Get the current disk usage for a particular service and process.
1516+ /// Get the current usage metrics exposed by a clusterd process.
14991517 ///
1500- /// Disk usage is collected by connecting to a metrics endpoint exposed by the process. The
1501- /// endpoint is assumed to be reachable at the 'internal-http' under the HTTP path
1518+ /// Usage metrics are collected by connecting to a metrics endpoint exposed by the process.
1519+ /// The endpoint is assumed to be reachable at the 'internal-http' port under the HTTP path
15021520 /// `/api/usage-metrics`.
1503- async fn get_disk_usage (
1521+ async fn get_clusterd_usage (
15041522 self_ : & OrchestratorWorker ,
15051523 service_name : & str ,
15061524 i : usize ,
1507- ) -> anyhow:: Result < Option < u64 > > {
1508- #[ derive( Deserialize ) ]
1509- pub ( crate ) struct Usage {
1510- disk_bytes : Option < u64 > ,
1511- swap_bytes : Option < u64 > ,
1512- }
1513-
1525+ ) -> anyhow:: Result < ClusterdUsage > {
15141526 let service = self_
15151527 . service_api
15161528 . get ( service_name)
@@ -1542,17 +1554,9 @@ impl OrchestratorWorker {
15421554 . build ( )
15431555 . context ( "error building HTTP client" ) ?;
15441556 let resp = http_client. get ( metrics_url) . send ( ) . await ?;
1545- let Usage {
1546- disk_bytes,
1547- swap_bytes,
1548- } = resp. json ( ) . await ?;
1557+ let usage = resp. json ( ) . await ?;
15491558
1550- let bytes = if let ( Some ( disk) , Some ( swap) ) = ( disk_bytes, swap_bytes) {
1551- Some ( disk + swap)
1552- } else {
1553- disk_bytes. or ( swap_bytes)
1554- } ;
1555- Ok ( bytes)
1559+ Ok ( usage)
15561560 }
15571561
15581562 let ret =
0 commit comments