Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2018d03
docs(health): add GB200 NVSWITCH telemetry matrix
mkoci Jun 18, 2026
340d046
docs(health): record nv-redfish dependency path
mkoci Jun 18, 2026
f9105b5
docs(health): clarify nv-redfish local patch strategy
mkoci Jun 18, 2026
4fa9b6f
feat(health): collect GB200 NVSwitch telemetry gaps
mkoci Jun 18, 2026
3db8cb2
feat(health): rework GB200 NVSwitch telemetry to explicit live-valida…
mkoci Jun 23, 2026
acd2a38
feat(health): reclaim 4 NVSwitch cable fault rows via NMX-T
mkoci Jun 23, 2026
eea4bcd
feat(health): implement 6 string-valued NVSwitch catalog rows
mkoci Jun 23, 2026
2bb4bbe
feat(health): implement 21 temp-threshold + 8 temp-current rows via N…
mkoci Jun 23, 2026
f0b4fa2
feat(health): reclaim 5 NVSwitch catalog rows via live gNMI/NVUE-REST…
mkoci Jun 23, 2026
589ab5a
feat(health): exclude high-cardinality free-text labels from the Prom…
mkoci Jun 25, 2026
dedab8e
refactor(health): struct allowlists, StateSet enum metrics, NMX-T lab…
mkoci Jun 25, 2026
f73d4d8
docs(health): reconcile GB200 matrix + runbook for StateSet/represent…
mkoci Jun 25, 2026
fab0c2d
feat(health): OTLP metrics export full Prometheus-style names + switc…
mkoci Jun 25, 2026
a4c48c0
fix(health): NMX-T client accept self-signed certs (fixes builder error)
mkoci Jun 25, 2026
3af9b99
fix(health): gNMI TLS uses tonic native custom verifier (skip-verify)
mkoci Jun 26, 2026
db79da0
chore(health): remove temp docs from repo
mkoci Jun 26, 2026
b12337d
fix(health): prevent empty labels from propagating. Update example co…
mkoci Jun 26, 2026
51e080f
fix(health): default to strict TLS verification. add optional flag in…
mkoci Jun 26, 2026
f758afe
lint(health): fix
mkoci Jun 26, 2026
9b6f4c9
chore(health): remove leftover GB200 NVSWITCH matrix generator
mkoci Jun 27, 2026
2863717
chore(health): fix comment copy
mkoci Jun 27, 2026
ae8df21
fix(health): nmxt cleanup. Fix wasteful label rebuilds
mkoci Jun 27, 2026
a5723d3
chore(health): comment cleanup. fixing labels
mkoci Jun 27, 2026
4a8bcfe
fix(health): added back allowlist guard
mkoci Jun 27, 2026
5eb393d
fix(health): remove label dupes
mkoci Jun 28, 2026
4e7af93
fix(health): add dangerous TLS gate to nmxt as well. Remove status_me…
mkoci Jun 29, 2026
886fd6d
fix(health): address @coderabbit's random nits :/
mkoci Jun 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions crates/health/example/config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,9 @@ logs_state_file = "/tmp/logs_collector_{machine_id}.json"
[collectors.nmxt]
scrape_interval = "1m"
request_timeout = "30s"
# Keep strict TLS certificate verification by default. Set to true only when an
# NMX-T HTTPS endpoint uses a certificate that cannot be verified normally.
dangerously_skip_tls_verification = false

[collectors.nvue.rest]
poll_interval = "1m"
Expand All @@ -222,21 +225,32 @@ system_health_enabled = true
cluster_apps_enabled = true
sdn_partitions_enabled = true
interfaces_enabled = true

# NVUE gNMI streaming collector (switches only, disabled by default).
# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink
# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when
# configured separately) pushes to an OTel Collector.
platform_environment_fan_enabled = true
platform_environment_temperature_enabled = true
platform_environment_status_enabled = true
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# NVUE gNMI streaming collector. It subscribes to gNMI SAMPLE paths
# (components + interfaces, plus platform_general when
# platform_general_enabled is true) and pushes metrics through the configured
# sinks. The gNMI ON_CHANGE subscription targets system events.
[collectors.nvue.gnmi]
# periodic SAMPLE (components, interfaces, and platform_general when
# platform_general_enabled is true)
gnmi_port = 9339
sample_interval = "5m"
request_timeout = "30s"
# gNMI ON_CHANGE subscription for system events
# Keep strict TLS certificate and hostname verification by default. Set to true
# only for lab/self-signed NVOS gNMI endpoints where that dangerous bypass is
# required.
dangerously_skip_tls_verification = false
# streaming ON_CHANGE
system_events_enabled = true

[collectors.nvue.gnmi.paths]
components_enabled = true
interfaces_enabled = true
# Switch-level memory and disk utilization from `/platform-general/state`
# (a singleton, not keyed by interface or component name).
platform_general_enabled = true

# ==============================================================================
# Processors
Expand Down
Loading
Loading