Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d66a19f
docs(health): add GB200 NVSWITCH telemetry matrix
mkoci Jun 18, 2026
aaf72b4
docs(health): record nv-redfish dependency path
mkoci Jun 18, 2026
8f213f1
docs(health): clarify nv-redfish local patch strategy
mkoci Jun 18, 2026
0fdc4c9
feat(health): collect GB200 NVSwitch telemetry gaps
mkoci Jun 18, 2026
ca733f2
feat(health): rework GB200 NVSwitch telemetry to explicit live-valida…
mkoci Jun 23, 2026
bef554b
feat(health): reclaim 4 NVSwitch cable fault rows via NMX-T
mkoci Jun 23, 2026
7b01512
feat(health): implement 6 string-valued NVSwitch catalog rows
mkoci Jun 23, 2026
f4882fd
feat(health): implement 21 temp-threshold + 8 temp-current rows via N…
mkoci Jun 23, 2026
2e3209b
feat(health): reclaim 5 NVSwitch catalog rows via live gNMI/NVUE-REST…
mkoci Jun 23, 2026
f96a1cc
feat(health): exclude high-cardinality free-text labels from the Prom…
mkoci Jun 25, 2026
4582d0e
refactor(health): struct allowlists, StateSet enum metrics, NMX-T lab…
mkoci Jun 25, 2026
2d2fd69
docs(health): reconcile GB200 matrix + runbook for StateSet/represent…
mkoci Jun 25, 2026
c95393f
Merge remote-tracking branch 'origin/main' into nvswitch_telemetry_gaps
mkoci Jun 25, 2026
ef8f173
feat(health): OTLP metrics export full Prometheus-style names + switc…
mkoci Jun 25, 2026
a08b956
fix(health): NMX-T client accept self-signed certs (fixes builder error)
mkoci Jun 25, 2026
84a4de3
fix(health): gNMI TLS uses tonic native custom verifier (skip-verify)
mkoci Jun 26, 2026
57eb8b7
chore(health): remove temp docs from repo
mkoci Jun 26, 2026
96377c7
Merge remote-tracking branch 'origin/main' into nvswitch_telemetry_gaps
mkoci Jun 26, 2026
f96a515
fix(health): prevent empty labels from propagating. Update example co…
mkoci Jun 26, 2026
7bf26d6
fix(health): default to strict TLS verification. add optional flag in…
mkoci Jun 26, 2026
3151bfc
lint(health): fix
mkoci Jun 26, 2026
52ceca5
chore(health): remove leftover GB200 NVSWITCH matrix generator
mkoci Jun 27, 2026
42781fb
chore(health): fix comment copy
mkoci Jun 27, 2026
ba3f114
fix(health): nmxt cleanup. Fix wasteful label rebuilds
mkoci Jun 27, 2026
0524fd7
chore(health): comment cleanup. fixing labels
mkoci Jun 27, 2026
e293294
fix(health): added back allowlist guard
mkoci Jun 27, 2026
961fd8c
fix(health): remove label dupes
mkoci Jun 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions crates/health/example/config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -222,21 +222,31 @@ system_health_enabled = true
cluster_apps_enabled = true
sdn_partitions_enabled = true
interfaces_enabled = true
platform_environment_temperature_enabled = true
platform_environment_status_enabled = true

# NVUE gNMI streaming collector (switches only, disabled by default).
# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink
# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when
# configured separately) pushes to an OTel Collector.
# NVUE gNMI streaming collector. It subscribes to gNMI SAMPLE paths
# (components and interfaces, plus platform_general when
# platform_general_enabled is true) and pushes metrics through the configured
# sinks. The gNMI ON_CHANGE subscription targets system events.
[collectors.nvue.gnmi]
# Port used to reach the gNMI endpoint.
gnmi_port = 9339
# Interval of the periodic SAMPLE subscription (components, interfaces, and
# platform_general when platform_general_enabled is true).
sample_interval = "5m"
# NOTE(review): presumably the timeout applied to each gNMI request; confirm
# against the collector implementation.
request_timeout = "30s"
# Keep strict TLS certificate and hostname verification by default. Set to true
# only for lab/self-signed NVOS gNMI endpoints where that dangerous bypass is
# required.
dangerously_skip_tls_verification = false
# Streaming gNMI ON_CHANGE subscription for system events.
system_events_enabled = true

# Toggles for the individual gNMI SAMPLE paths the collector subscribes to.
[collectors.nvue.gnmi.paths]
components_enabled = true
interfaces_enabled = true
# Switch-level memory and disk utilization from `/platform-general/state`
# (a singleton, not keyed by interface or component name).
platform_general_enabled = true

# ==============================================================================
# Processors
Expand Down
Loading