diff --git a/src/otlp/gauges/gpu.rs b/src/otlp/gauges/gpu.rs
new file mode 100644
index 0000000..d01096f
--- /dev/null
+++ b/src/otlp/gauges/gpu.rs
@@ -0,0 +1,83 @@
+// Copyright (c) 2024-2025 Optimatist Technology Co., Ltd. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This file is part of PSH.
+//
+// PSH is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License
+// as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+// PSH is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
+// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License along with Performance Savior Home (PSH). If not,
+// see <https://www.gnu.org/licenses/>.
+
+use opentelemetry::{KeyValue, metrics::ObservableGauge};
+use psh_system::gpu::NvidiaHandle;
+
+impl super::super::Otlp {
+    /// Registers the `NvGpuStat` observable gauge on this instance's meter.
+    ///
+    /// Each time the callback runs it queries the NVIDIA handle and, for every
+    /// returned GPU entry, records one `u64` observation per statistic. Every
+    /// observation carries the `host`, `uuid`, `name` and `stat` attributes.
+    ///
+    /// The body has no error path of its own, so the returned value is always `Ok`.
+    pub fn gpu_gauges(&self) -> anyhow::Result<ObservableGauge<u64>> {
+        let host = self.host.clone();
+        let interval = self.interval;
+        let nvgpu = NvidiaHandle::new();
+
+        let gauge = self
+            .meter
+            .u64_observable_gauge("NvGpuStat")
+            .with_description("System profile nvgpu statistics.")
+            .with_callback(move |gauge| {
+                // Nothing is reported for this run if the statistics cannot be read.
+                let Ok(gpustats) = nvgpu.stat(Some(interval)) else {
+                    return;
+                };
+
+                for stat in gpustats {
+                    // (value, `stat` attribute) pairs; every value is observed as a `u64`.
+                    let vals = [
+                        (stat.irq_num.into(), KeyValue::new("stat", "irq_num")),
+                        (
+                            stat.temperature.into(),
+                            KeyValue::new("stat", "temperature"),
+                        ),
+                        (
+                            stat.max_pcie_link_gen.into(),
+                            KeyValue::new("stat", "max_pcie_link_gen"),
+                        ),
+                        (
+                            stat.memory_info.total,
+                            KeyValue::new("stat", "memory_total"),
+                        ),
+                        (stat.memory_info.used, KeyValue::new("stat", "memory_used")),
+                        (
+                            stat.utilization_rates.memory.into(),
+                            KeyValue::new("stat", "utilization_rates_memory"),
+                        ),
+                        (
+                            stat.utilization_rates.gpu.into(),
+                            KeyValue::new("stat", "utilization_rates_gpu"),
+                        ),
+                    ];
+                    for val in vals.into_iter() {
+                        gauge.observe(
+                            val.0,
+                            &[
+                                KeyValue::new("host", host.clone()),
+                                KeyValue::new("uuid", stat.uuid.clone()),
+                                KeyValue::new("name", stat.name.clone()),
+                                val.1,
+                            ],
+                        );
+                    }
+                }
+            })
+            .build();
+        Ok(gauge)
+    }
+}
diff --git a/src/otlp/gauges/mod.rs b/src/otlp/gauges/mod.rs
index d9d6029..0e9e71e 100644
--- a/src/otlp/gauges/mod.rs
+++ b/src/otlp/gauges/mod.rs
@@ -14,6 +14,7 @@
 
 pub mod cpu;
 pub mod disk;
+pub mod gpu;
 pub mod interrupt;
 pub mod memory;
 pub mod network;
diff --git a/src/otlp/mod.rs b/src/otlp/mod.rs
index 6c00ad3..41ccd61 100644
--- a/src/otlp/mod.rs
+++ b/src/otlp/mod.rs
@@ -93,6 +93,9 @@ impl Otlp {
         if let Err(e) = self.vmstat_gauges() {
             tracing::error!("Otlp vmstat: {e}")
         }
+        if let Err(e) = self.gpu_gauges() {
+            tracing::error!("Otlp gpu: {e}")
+        }
 
         loop {
             tokio::time::sleep(interval).await;