diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 871f9c434a..470a2bc9de 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -222,21 +222,31 @@ system_health_enabled = true cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true +platform_environment_temperature_enabled = true +platform_environment_status_enabled = true -# NVUE gNMI streaming collector (switches only, disabled by default). -# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink -# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when -# configured separately) pushes to an OTel Collector. +# NVUE gNMI streaming collector which subscribes to +# gNMI SAMPLE paths (components + interfaces, plus platform_general when +# platform_general_enabled is true) and pushes metrics through the configured +# sinks. gNMI ON_CHANGE targets system-events [collectors.nvue.gnmi] +# periodic SAMPLE (components, interfaces, and platform_general when +# platform_general_enabled is true) gnmi_port = 9339 sample_interval = "5m" request_timeout = "30s" -# gNMI ON_CHANGE subscription for system events +# Keep strict TLS certificate and hostname verification by default. Set true only +# for lab/self-signed NVOS gNMI endpoints where that dangerous bypass is required. +dangerously_skip_tls_verification = false +# streaming ON_CHANGE system_events_enabled = true [collectors.nvue.gnmi.paths] components_enabled = true interfaces_enabled = true +# Switch-level memory and disk utilization from `/platform-general/state` +# (a singleton, not keyed by interface or component name). 
+platform_general_enabled = true # ============================================================================== # Processors diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 7f762a2ed8..486424a255 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -16,11 +16,18 @@ */ //! This module collects metrics from NMX-T telemetry endpoints on NVLink switches if the service is enabled. -//! Scrapes HTTP on 9352 (default for NMX-T) - NOT A Redfish collector! -//! Currently scraping for Effective BER, Symbol Errors and Link Down counter. +//! Scrapes HTTP on 9352 (default for NMX-T) +//! +//! Mapping is an EXPLICIT, catalog-row allowlist over the live NMX-T Prometheus scrape (see +//! `NMXT_METRIC_MAP` and `NMXT_LABEL_MAP`). Each NMX-T source name is either: +//! * a numeric **family** -> emitted as one canonical `switch_nmxt` series (`NMXT_METRIC_MAP`), or +//! * an identity/inventory **label dimension** carried on every series -> re-exported as a +//! canonical label, never as a standalone metric (`NMXT_LABEL_MAP`). +//! +//! Source names not on either allowlist are skipped and counted only (never sanitized into telemetry). 
use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use nv_redfish::core::Bmc; @@ -28,7 +35,7 @@ use nv_redfish::core::Bmc; use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; -use crate::endpoint::{BmcEndpoint, EndpointMetadata}; +use crate::endpoint::BmcEndpoint; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; /// default NMX-T port @@ -37,7 +44,321 @@ const NMXT_PORT: u16 = 9352; /// NMX-T endpoint const NMXT_ENDPOINT: &str = "/xcset/nvlink_domain_telemetry"; -/// Prometheus text -> NmxtMetricSample +/// MetricSample name for NMX-T metrics +const NMXT_METRIC_NAME: &str = "switch_nmxt"; + +#[derive(Debug, PartialEq)] +struct NmxtMetric { + source: &'static str, + metric_type: &'static str, + unit: &'static str, +} + +#[derive(Debug, PartialEq)] +struct NmxtLabel { + source: &'static str, + canonical: &'static str, +} + +const NMXT_METRIC_MAP: &[NmxtMetric] = &[ + NmxtMetric { + source: "Effective_BER", + metric_type: "effective_ber", + unit: "ratio", + }, + NmxtMetric { + source: "Symbol_Errors", + metric_type: "symbol_errors", + unit: "count", + }, // PHY-SYMBOL-ERRORS + NmxtMetric { + source: "Link_Down", + metric_type: "link_down", + unit: "count", + }, + NmxtMetric { + source: "lid", + metric_type: "lid", + unit: "id", + }, // LID + NmxtMetric { + source: "device_hw_rev", + metric_type: "device_hw_rev", + unit: "id", + }, // DEVICE-HARDWARE-REVISION + NmxtMetric { + source: "Advanced_Status_Opcode", + metric_type: "status_opcode", + unit: "code", + }, // STATUS-OPCODE + NmxtMetric { + source: "remote_reason_opcode", + metric_type: "remote_reason_opcode", + unit: "code", + }, // REMOTE-REASON-OPCODE + NmxtMetric { + source: "time_to_link_up_ext_msec", + metric_type: "time_to_link_up", + unit: "milliseconds", + }, // TIME-TO-LINKS-UP + NmxtMetric { + source: "cable_technology", 
+ metric_type: "cable_transmitter_technology", + unit: "code", + }, // CABLE-TRANSMITTER-TECHNOLOGY + NmxtMetric { + source: "rx_power_lane_0", + metric_type: "cable_rx_power_lane0", + unit: "milliwatts", + }, // CABLE-RX-POWER-LANE0 + NmxtMetric { + source: "rx_power_lane_1", + metric_type: "cable_rx_power_lane1", + unit: "milliwatts", + }, // CABLE-RX-POWER-LANE1 + NmxtMetric { + source: "Module_Voltage", + metric_type: "cable_diag_supply_voltage", + unit: "volts", + }, // CABLE-DIAG-SUPPLY-VOLTAGE + NmxtMetric { + source: "link_partner_lid", + metric_type: "link_partner_lid", + unit: "id", + }, // LINK-PARTNER-LID + NmxtMetric { + source: "successful_recovery_events", + metric_type: "link_recovery_success_cnt", + unit: "count", + }, // LINK-RECOVERY-SUCCESS-CNT + NmxtMetric { + source: "total_successful_recovery_events", + metric_type: "total_link_recovery_success_cnt", + unit: "count", + }, // TOTAL-LINK-RECOVERY-SUCCESS-CNT + NmxtMetric { + source: "time_since_last_recovery", + metric_type: "time_since_last_recovery", + unit: "seconds", + }, // TIME-SINCE-LAST-RECOVERY + NmxtMetric { + source: "time_between_last_2_recoveries", + metric_type: "time_btwn_two_recoveries", + unit: "seconds", + }, // TIME-BTWN-TWO-RECOVERIES + NmxtMetric { + source: "last_host_logical_recovery_attempts_count", + metric_type: "recovery_attempts_l1_cnt", + unit: "count", + }, // RECOVERY-ATTEMPTS-L1-CNT + NmxtMetric { + source: "last_host_serdes_feq_attempts_count", + metric_type: "recovery_attempts_l2_cnt", + unit: "count", + }, // RECOVERY-ATTEMPTS-L2-CNT + NmxtMetric { + source: "time_in_last_host_logical_recovery", + metric_type: "recovery_cycle_duration", + unit: "seconds", + }, // RECOVERY-CYCLE-DURATION + NmxtMetric { + source: "time_in_last_host_serdes_feq_recovery", + metric_type: "serdes_recovery_cycle_duration", + unit: "seconds", + }, // SERDES-RECOVERY-CYCLE-DURATION + NmxtMetric { + source: "contain_n_drain_xmit_discards", + metric_type: "contain_drain_xmit_discard", + 
unit: "count", + }, // CONTAIN-DRAIN-XMIT-DISCARD + NmxtMetric { + source: "contain_n_drain_rcv_discards", + metric_type: "contain_drain_rcv_discard", + unit: "count", + }, // CONTAIN-DRAIN-RCV-DISCARD + NmxtMetric { + source: "Raw_Errors_Lane_2", + metric_type: "raw_err_lane_2", + unit: "count", + }, // RAW-ERR-LANE-2 + NmxtMetric { + source: "Raw_Errors_Lane_3", + metric_type: "raw_err_lane_3", + unit: "count", + }, // RAW-ERR-LANE-3 + NmxtMetric { + source: "tx_cdr_lol", + metric_type: "cable_tx_cdr_lol", + unit: "state", + }, // CABLE-TX-CDR-LOL + NmxtMetric { + source: "rx_cdr_lol", + metric_type: "cable_rx_cdr_lol", + unit: "state", + }, // CABLE-RX-CDR-LOL + NmxtMetric { + source: "tx_los", + metric_type: "cable_tx_los", + unit: "state", + }, // CABLE-TX-LOS + NmxtMetric { + source: "rx_los", + metric_type: "cable_rx_los", + unit: "state", + }, // CABLE-RX-LOS +]; + +const NMXT_LABEL_MAP: &[NmxtLabel] = &[ + NmxtLabel { + source: "FW_Version", + canonical: "net_fw_ver", + }, // NET-FW-VER + NmxtLabel { + source: "sw_serial_number", + canonical: "serial", + }, // SERIAL + NmxtLabel { + source: "Node_GUID", + canonical: "node_guid", + }, // NODE-GUID + NmxtLabel { + source: "port_guid", + canonical: "port_guid", + }, // PORT-GUID + NmxtLabel { + source: "Port_Number", + canonical: "port_num", + }, // PORT-NUMBER + NmxtLabel { + source: "port_label", + canonical: "port_label", + }, // PORT-LABEL + NmxtLabel { + source: "sw_revision", + canonical: "revision", + }, // REVISION + NmxtLabel { + source: "Active_FEC", + canonical: "fec_mode_active", + }, // FEC-MODE-ACTIVE + NmxtLabel { + source: "Device_ID", + canonical: "device_id", + }, // DEVICE-ID + NmxtLabel { + source: "Status_Message", + canonical: "status_message", + }, // STATUS-MESSAGE + NmxtLabel { + source: "local_reason_opcode", + canonical: "local_reason_opcode", + }, // LOCAL-REASON-OPCODE + NmxtLabel { + source: "Cable_PN", + canonical: "cable_part_number", + }, // CABLE-PART-NUMBER + NmxtLabel { + 
source: "Cable_SN", + canonical: "cable_serial_number", + }, // CABLE-SERIAL-NUMBER + NmxtLabel { + source: "cable_type", + canonical: "cable_type", + }, // CABLE-TYPE + NmxtLabel { + source: "cable_vendor", + canonical: "cable_vendor", + }, // CABLE-VENDOR + NmxtLabel { + source: "cable_length", + canonical: "cable_length", + }, // CABLE-LENGTH + NmxtLabel { + source: "cable_identifier", + canonical: "cable_identifier", + }, // CABLE-IDENTIFIER + NmxtLabel { + source: "vendor_rev", + canonical: "cable_rev", + }, // CABLE-REV + NmxtLabel { + source: "cable_fw_version", + canonical: "cable_fw_version", + }, // CABLE-FW-VERSION + NmxtLabel { + source: "link_partner_description", + canonical: "link_partner_description", + }, // LINK-PARTNER-DESCRIPTION + NmxtLabel { + source: "link_partner_node_guid", + canonical: "link_partner_node_guid", + }, // LINK-PARTNER-NODE-GUID + NmxtLabel { + source: "link_partner_port_num", + canonical: "link_partner_port_num", + }, // LINK-PARTNER-PORT-NUM + NmxtLabel { + source: "device_num_on_tray", + canonical: "device_num", + }, // DEVICE-NUM + NmxtLabel { + source: "board_type", + canonical: "board_type", + }, // BOARD-TYPE + NmxtLabel { + source: "chassis_slot_index", + canonical: "chassis_slot_idx", + }, // CHASSIS-SLOT-IDX + NmxtLabel { + source: "tray_index", + canonical: "tray_idx", + }, // TRAY-IDX + NmxtLabel { + source: "topology_id", + canonical: "topology_id", + }, // TOPOLOGY-ID + NmxtLabel { + source: "chassis_id", + canonical: "chassis_id", + }, // CHASSIS-ID +]; + +fn lookup_nmxt_metric(name: &str) -> Option<&'static NmxtMetric> { + NMXT_METRIC_MAP.iter().find(|m| m.source == name) +} + +/// Parse `Module_Temperature` as a label value (e.g. 
`"0C"`), never its own numeric +/// line and emit as a gauge with either numeric or `None +fn cable_temp_to_celsius(raw: &str) -> Option { + let trimmed = raw.trim(); + let digits = trimmed.strip_suffix(['C', 'c']).unwrap_or(trimmed).trim(); + digits.parse::().ok() +} + +/// Enum for `down_blame`, emitted as a StateSet (one 0/1 series per state). +const DOWN_BLAME_STATES: &[&str] = &["unknown", "local_phy", "remote_phy"]; + +/// Maps a raw `down_blame` value to its canonical state, case-insensitively; unknown/empty -> "unknown". +fn down_blame_to_state(raw: &str) -> &'static str { + match raw.trim().to_ascii_lowercase().as_str() { + "local_phy" => "local_phy", + "remote_phy" => "remote_phy", + _ => "unknown", + } +} + +fn required_port_num(sample_labels: &HashMap) -> Option<&str> { + sample_labels + .get("Port_Number") + .map(String::as_str) + .filter(|port_num| !port_num.is_empty()) +} + +#[cfg(test)] +fn lookup_nmxt_label(key: &str) -> Option<&'static NmxtLabel> { + NMXT_LABEL_MAP.iter().find(|l| l.source == key) +} + #[derive(Debug, Clone)] struct NmxtMetricSample { name: String, @@ -45,7 +366,6 @@ struct NmxtMetricSample { value: f64, } -/// Parse Prometheus text format metrics from NMX-T endpoint fn parse_prometheus_metrics(body: &str) -> Vec { let mut samples = Vec::new(); @@ -63,15 +383,12 @@ fn parse_prometheus_metrics(body: &str) -> Vec { samples } -/// Parse a single text line fn parse_prometheus_line(line: &str) -> Option { - // find labels let (name_part, rest) = if let Some(brace_pos) = line.find('{') { let name = &line[..brace_pos]; let rest = &line[brace_pos..]; (name, rest) } else { - // no labels let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let name = parts[0]; @@ -108,7 +425,6 @@ fn parse_prometheus_line(line: &str) -> Option { }) } -/// scrape nmxt metrics from a single switch async fn scrape_switch_nmxt_metrics( http_client: &reqwest::Client, switch_ip: &str, @@ -142,10 +458,8 @@ pub struct NmxtCollectorConfig 
{ pub data_sink: Option>, } -/// NMX-T collector for a single switch/endpoint pub struct NmxtCollector { endpoint: Arc, - switch_id: String, http_client: reqwest::Client, event_context: EventContext, data_sink: Option>, @@ -159,15 +473,16 @@ impl PeriodicCollector for NmxtCollector { endpoint: Arc, config: Self::Config, ) -> Result { - let switch_id = match &endpoint.metadata { - Some(EndpointMetadata::Switch(s)) => s.serial.clone(), - _ => endpoint.addr.mac.to_string(), - }; let event_context = EventContext::from_endpoint(endpoint.as_ref(), "nmxt"); let request_timeout = config.nmxt_config.request_timeout; let http_client = reqwest::Client::builder() .timeout(request_timeout) + // NMX-T switch endpoints serve a self-signed cert (same as the NVUE REST + // collector). Accepting invalid certs also avoids a native-root-CA load + // failure at client build time on minimal runtime images without + // ca-certificates, which otherwise surfaces as "builder error". + .danger_accept_invalid_certs(true) .build() .map_err(|e| { HealthError::GenericError(format!("Failed to create HTTP client: {}", e)) @@ -175,7 +490,6 @@ impl PeriodicCollector for NmxtCollector { Ok(Self { endpoint, - switch_id, http_client, event_context, data_sink: config.data_sink, @@ -207,6 +521,22 @@ impl NmxtCollector { } } + /// Builds label set for one `switch_nmxt` series + fn build_labels( + &self, + sample_labels: &HashMap, + ) -> Vec<(Cow<'static, str>, String)> { + let mut labels: Vec<(Cow<'static, str>, String)> = Vec::with_capacity(NMXT_LABEL_MAP.len()); + + for label in NMXT_LABEL_MAP { + if let Some(value) = sample_labels.get(label.source) { + labels.push((Cow::Borrowed(label.canonical), value.clone())); + } + } + + labels + } + async fn scrape_iteration(&self) -> Result<(), HealthError> { let switch_ip = self.endpoint.addr.ip.to_string(); @@ -214,40 +544,94 @@ impl NmxtCollector { self.emit_event(CollectorEvent::MetricCollectionStart); + // Ports already emitted a cable temperature this 
iteration (one series per port). + let mut cable_temp_ports: HashSet = HashSet::new(); + // Ports already emitted a down_blame StateSet this iteration (one set per port). + let mut down_blame_ports: HashSet = HashSet::new(); + for sample in metrics { let NmxtMetricSample { name, - labels: mut sample_labels, + labels: sample_labels, value, } = sample; - let port_num = sample_labels.remove("Port_Number").unwrap_or_default(); - let node_guid = sample_labels.remove("Node_GUID").unwrap_or_default(); - - let metric_type = match name.as_str() { - "Effective_BER" => "effective_ber", - "Symbol_Errors" => "symbol_errors", - "Link_Down" => "link_down", - _ => continue, + + // `Module_Temperature` rides as a label on lines whose map entry may not be + // collected. Emit before the map check, once per port. + if let Some(celsius) = sample_labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)) + { + let Some(port_num) = required_port_num(&sample_labels) else { + continue; + }; + if cable_temp_ports.insert(port_num.to_string()) { + let labels = self.build_labels(&sample_labels); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("cable_temperature_celsius:{}", port_num), + name: NMXT_METRIC_NAME.to_string(), + metric_type: "cable_temperature_celsius".to_string(), + unit: "celsius".to_string(), + value: celsius, + labels, + context: None, + } + .into(), + )); + } + } + + // `down_blame` is an enum riding as a label; emit per port as a StateSet + if let Some(raw) = sample_labels.get("down_blame") { + let Some(port_num) = required_port_num(&sample_labels) else { + continue; + }; + if down_blame_ports.insert(port_num.to_string()) { + let current = down_blame_to_state(raw); + let base_labels = self.build_labels(&sample_labels); + for state in DOWN_BLAME_STATES { + let mut labels = base_labels.clone(); + labels.push((Cow::Borrowed("state"), (*state).to_string())); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: 
format!("down_blame:{}:{}", port_num, state), + name: NMXT_METRIC_NAME.to_string(), + metric_type: "down_blame".to_string(), + unit: "state".to_string(), + value: if *state == current { 1.0 } else { 0.0 }, + labels, + context: None, + } + .into(), + )); + } + } + } + + let Some(metric) = lookup_nmxt_metric(&name) else { + continue; + }; + let (metric_type, unit) = (metric.metric_type, metric.unit); + + // Port number anchors the per-series key. + let Some(port_num) = required_port_num(&sample_labels) else { + continue; }; let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); metric_key.push_str(metric_type); metric_key.push(':'); - metric_key.push_str(&port_num); + metric_key.push_str(port_num); - let labels = vec![ - (Cow::Borrowed("switch_id"), self.switch_id.clone()), - (Cow::Borrowed("switch_ip"), switch_ip.clone()), - (Cow::Borrowed("node_guid"), node_guid), - (Cow::Borrowed("port_num"), port_num), - ]; + let labels = self.build_labels(&sample_labels); self.emit_event(CollectorEvent::Metric( MetricSample { key: metric_key, - name: "switch_nmxt".to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: metric_type.to_string(), - unit: "count".to_string(), + unit: unit.to_string(), value, labels, context: None, @@ -304,4 +688,457 @@ Link_Down{Port_Number="1"} 5 let samples = parse_prometheus_metrics(body); assert_eq!(samples.len(), 4); } + + #[test] + fn test_required_port_num_requires_present_non_empty_label() { + let missing = HashMap::new(); + assert_eq!(required_port_num(&missing), None); + + let mut empty = HashMap::new(); + empty.insert("Port_Number".to_string(), String::new()); + assert_eq!(required_port_num(&empty), None); + + let mut present = HashMap::new(); + present.insert("Port_Number".to_string(), "11".to_string()); + assert_eq!(required_port_num(&present), Some("11")); + } + + /// Live NMX-T `lid` series from the Stage-0 GB200 scrape (`nmxt-prometheus.txt`). 
+ const SAMPLE_LID_LINE: &str = r#"lid{Device_ID="GB100", port_label="GPUP10", logical_state="ACT", device_num_on_tray="2", board_type="3", chassis_slot_index="27", tray_index="17", topology_id="128", chassis_id="1820325172739", Active_FEC="Int_KP4_FEC_PLR", link_partner_description="MF0;sw06:N5400_LD/U1", link_partner_node_guid="0x2c5eab0300b6a900", link_partner_port_num="71", cable_vendor="Other", down_blame="Unknown", local_reason_opcode="No_link_down_indication", Node_GUID="0xe1d04a69816f16bc", node_description="GB100 Nvidia Technologies", Port_Number="11", FW_Version="36.2014.1866", Cable_PN="NA", Cable_SN="NA", cable_type="850 nm VCSEL", cable_length="NA", cable_identifier="Backplane", vendor_rev="NA", cable_fw_version="N/A", Module_Temperature="0C", Status_Message="No issue was observed", port_guid="0xe1d04a69816f16c6", sw_serial_number="MT123", sw_revision="A1", remote_reason_opcode="4"} 3093 1781993954087"#; + + #[test] + fn test_nmxt_metric_map_locks_type_and_unit() { + let expected: &[(&str, &str, &str)] = &[ + ("Effective_BER", "effective_ber", "ratio"), + ("Symbol_Errors", "symbol_errors", "count"), + ("Link_Down", "link_down", "count"), + ("lid", "lid", "id"), + ("device_hw_rev", "device_hw_rev", "id"), + ("Advanced_Status_Opcode", "status_opcode", "code"), + ("remote_reason_opcode", "remote_reason_opcode", "code"), + ( + "time_to_link_up_ext_msec", + "time_to_link_up", + "milliseconds", + ), + ("cable_technology", "cable_transmitter_technology", "code"), + ("rx_power_lane_0", "cable_rx_power_lane0", "milliwatts"), + ("rx_power_lane_1", "cable_rx_power_lane1", "milliwatts"), + ("Module_Voltage", "cable_diag_supply_voltage", "volts"), + ("link_partner_lid", "link_partner_lid", "id"), + ( + "successful_recovery_events", + "link_recovery_success_cnt", + "count", + ), + ( + "total_successful_recovery_events", + "total_link_recovery_success_cnt", + "count", + ), + ( + "time_since_last_recovery", + "time_since_last_recovery", + "seconds", + ), + ( + 
"time_between_last_2_recoveries", + "time_btwn_two_recoveries", + "seconds", + ), + ( + "last_host_logical_recovery_attempts_count", + "recovery_attempts_l1_cnt", + "count", + ), + ( + "last_host_serdes_feq_attempts_count", + "recovery_attempts_l2_cnt", + "count", + ), + ( + "time_in_last_host_logical_recovery", + "recovery_cycle_duration", + "seconds", + ), + ( + "time_in_last_host_serdes_feq_recovery", + "serdes_recovery_cycle_duration", + "seconds", + ), + ( + "contain_n_drain_xmit_discards", + "contain_drain_xmit_discard", + "count", + ), + ( + "contain_n_drain_rcv_discards", + "contain_drain_rcv_discard", + "count", + ), + ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), + ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), + ("tx_cdr_lol", "cable_tx_cdr_lol", "state"), + ("rx_cdr_lol", "cable_rx_cdr_lol", "state"), + ("tx_los", "cable_tx_los", "state"), + ("rx_los", "cable_rx_los", "state"), + ]; + + for (source, metric_type, unit) in expected { + let m = lookup_nmxt_metric(source) + .unwrap_or_else(|| panic!("family `{source}` must be allowlisted")); + assert_eq!( + (m.metric_type, m.unit), + (*metric_type, *unit), + "family `{source}` must map to ({metric_type}, {unit})" + ); + } + // The allowlist must contain exactly these explicit families (no extras, no generic). 
+ assert_eq!(NMXT_METRIC_MAP.len(), expected.len()); + } + + #[test] + fn test_nmxt_label_map_locks_canonical_names() { + let expected: &[(&str, &str)] = &[ + ("FW_Version", "net_fw_ver"), + ("sw_serial_number", "serial"), + ("Node_GUID", "node_guid"), + ("port_guid", "port_guid"), + ("Port_Number", "port_num"), + ("port_label", "port_label"), + ("sw_revision", "revision"), + ("Active_FEC", "fec_mode_active"), + ("Device_ID", "device_id"), + ("Status_Message", "status_message"), + ("local_reason_opcode", "local_reason_opcode"), + ("Cable_PN", "cable_part_number"), + ("Cable_SN", "cable_serial_number"), + ("cable_type", "cable_type"), + ("cable_vendor", "cable_vendor"), + ("cable_length", "cable_length"), + ("cable_identifier", "cable_identifier"), + ("vendor_rev", "cable_rev"), + ("cable_fw_version", "cable_fw_version"), + ("link_partner_description", "link_partner_description"), + ("link_partner_node_guid", "link_partner_node_guid"), + ("link_partner_port_num", "link_partner_port_num"), + ("device_num_on_tray", "device_num"), + ("board_type", "board_type"), + ("chassis_slot_index", "chassis_slot_idx"), + ("tray_index", "tray_idx"), + ("topology_id", "topology_id"), + ("chassis_id", "chassis_id"), + ]; + + for (key, canonical) in expected { + assert_eq!( + lookup_nmxt_label(key).map(|l| l.canonical), + Some(*canonical), + "label `{key}` must map to canonical `{canonical}`" + ); + } + assert_eq!(NMXT_LABEL_MAP.len(), expected.len()); + } + + // Unknown NMX-T source names are not on either allowlist (never sanitized into telemetry). + #[test] + fn test_unknown_nmxt_sources_not_allowlisted() { + // Live-but-blocked families and arbitrary unknowns: all must be rejected. 
+ for unknown in [ + "HiRetransmissionRate", // row 931, not live + "rq_num_wrfe", // row 1706, not live + "rq_num_lle", // row 1707, not live + "sq_num_wrfe", // row 1708, not live + "Chip_Temp", // threshold blocker, not an NMX-T explicit mapping + "totally_made_up_metric", + ] { + assert!( + lookup_nmxt_metric(unknown).is_none(), + "`{unknown}` must not be an allowlisted family" + ); + assert!( + lookup_nmxt_label(unknown).is_none(), + "`{unknown}` must not be an allowlisted label" + ); + } + } + + // End-to-end: a live family line yields one canonical key and re-exported allowlisted labels. + #[test] + fn test_label_map_reexports_identity_dims_from_live_series() { + let sample = parse_prometheus_line(SAMPLE_LID_LINE).expect("parse lid line"); + assert_eq!(sample.name, "lid"); + + // Resolve canonical labels exactly as build_labels would (allowlist-gated). + let mut canonical = HashMap::new(); + for label in NMXT_LABEL_MAP { + if let Some(value) = sample.labels.get(label.source) { + canonical.insert(label.canonical, value.clone()); + } + } + + assert_eq!( + canonical.get("node_guid"), + Some(&"0xe1d04a69816f16bc".to_string()) + ); + assert_eq!( + canonical.get("port_guid"), + Some(&"0xe1d04a69816f16c6".to_string()) + ); + assert_eq!(canonical.get("port_num"), Some(&"11".to_string())); + assert_eq!(canonical.get("port_label"), Some(&"GPUP10".to_string())); + assert_eq!( + canonical.get("net_fw_ver"), + Some(&"36.2014.1866".to_string()) + ); + assert_eq!(canonical.get("serial"), Some(&"MT123".to_string())); + assert_eq!(canonical.get("revision"), Some(&"A1".to_string())); + assert_eq!(canonical.get("device_id"), Some(&"GB100".to_string())); + assert_eq!( + canonical.get("fec_mode_active"), + Some(&"Int_KP4_FEC_PLR".to_string()) + ); + assert_eq!(canonical.get("cable_part_number"), Some(&"NA".to_string())); + // Module_Temperature is no longer a re-exported label; it becomes a numeric metric. 
+ assert!(!canonical.contains_key("cable_temp")); + assert_eq!( + sample + .labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)), + Some(0.0) + ); + assert_eq!( + canonical.get("chassis_id"), + Some(&"1820325172739".to_string()) + ); + assert_eq!( + canonical.get("link_partner_node_guid"), + Some(&"0x2c5eab0300b6a900".to_string()) + ); + + // node_description is present on the series but NOT allowlisted -> not re-exported. + assert!(!canonical.contains_key("node_description")); + } + + #[test] + fn test_down_blame_to_state() { + assert_eq!(down_blame_to_state("Unknown"), "unknown"); + assert_eq!(down_blame_to_state("Local_phy"), "local_phy"); + assert_eq!(down_blame_to_state("Remote_phy"), "remote_phy"); + // Case-insensitive. + assert_eq!(down_blame_to_state("LOCAL_PHY"), "local_phy"); + assert_eq!(down_blame_to_state("remote_PHY"), "remote_phy"); + // Unrecognized / empty -> "unknown". + assert_eq!(down_blame_to_state("garbage"), "unknown"); + assert_eq!(down_blame_to_state(""), "unknown"); + } + + // Two scraped lines for the same port both carry down_blame="Remote_phy": exactly three + // down_blame series (one per state) are emitted for that port, remote_phy=1 the rest=0, + // unit "state", and down_blame is NOT a plain identity label on the emitted series. 
+ #[test] + fn test_down_blame_state_set_once_per_port() { + use std::sync::Mutex as StdMutex; + + use crate::endpoint::test_support::{mac, test_endpoint}; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let endpoint = Arc::new(test_endpoint(mac("00:11:22:33:44:55"))); + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let collector = NmxtCollector { + endpoint: endpoint.clone(), + http_client: reqwest::Client::new(), + event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), + data_sink: Some(sink.clone()), + }; + + // Two distinct families on the SAME port, both carrying down_blame. + let lines = [ + r#"lid{Port_Number="11", down_blame="Remote_phy"} 3093"#, + r#"Effective_BER{Port_Number="11", down_blame="Remote_phy"} 0"#, + ]; + let mut down_blame_ports: HashSet = HashSet::new(); + for line in lines { + let sample = parse_prometheus_line(line).expect("parse line"); + if let Some(raw) = sample.labels.get("down_blame") { + let Some(port_num) = required_port_num(&sample.labels) else { + continue; + }; + if down_blame_ports.insert(port_num.to_string()) { + let current = down_blame_to_state(raw); + for state in DOWN_BLAME_STATES { + let mut labels = collector.build_labels(&sample.labels); + labels.push((Cow::Borrowed("state"), (*state).to_string())); + collector.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("down_blame:{}:{}", port_num, state), + name: NMXT_METRIC_NAME.to_string(), + metric_type: "down_blame".to_string(), + unit: "state".to_string(), + value: if *state == current { 1.0 } else { 0.0 }, + labels, + context: None, + } + .into(), + )); + } + } + } + } + + let samples = 
sink.samples.lock().unwrap(); + let blame_series: Vec<&MetricSample> = samples + .iter() + .filter(|s| s.metric_type == "down_blame") + .collect(); + assert_eq!( + blame_series.len(), + 3, + "exactly one series per state per port per scrape" + ); + + for s in &blame_series { + assert_eq!(s.name, "switch_nmxt"); + assert_eq!(s.unit, "state"); + let state = s + .labels + .iter() + .find(|(k, _)| k == "state") + .map(|(_, v)| v.as_str()) + .expect("state label present"); + let expected = if state == "remote_phy" { 1.0 } else { 0.0 }; + assert_eq!(s.value, expected, "state `{state}` value"); + // down_blame must not survive as a plain identity label. + assert!( + !s.labels.iter().any(|(k, _)| k == "down_blame"), + "down_blame must not be a re-exported identity label" + ); + } + } + + #[test] + fn test_cable_temp_to_celsius() { + assert_eq!(cable_temp_to_celsius("0C"), Some(0.0)); + assert_eq!(cable_temp_to_celsius("37C"), Some(37.0)); + assert_eq!(cable_temp_to_celsius("37.5C"), Some(37.5)); + assert_eq!(cable_temp_to_celsius("N/A"), None); + assert_eq!(cable_temp_to_celsius(""), None); + assert_eq!(cable_temp_to_celsius("NA"), None); + } + + // Two scraped lines for the same port both carry Module_Temperature: exactly one + // cable_temperature_celsius series is emitted, with the parsed value and no cable_temp label. 
+ #[test] + fn test_cable_temperature_emit_once_per_port() { + use std::sync::Mutex as StdMutex; + + use crate::endpoint::test_support::{mac, test_endpoint}; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let endpoint = Arc::new(test_endpoint(mac("00:11:22:33:44:55"))); + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let collector = NmxtCollector { + endpoint: endpoint.clone(), + http_client: reqwest::Client::new(), + event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), + data_sink: Some(sink.clone()), + }; + + // Two distinct families on the SAME port, both carrying Module_Temperature. + let lines = [ + r#"lid{Port_Number="11", Module_Temperature="37.5C"} 3093"#, + r#"Effective_BER{Port_Number="11", Module_Temperature="37.5C"} 0"#, + ]; + let mut cable_temp_ports: HashSet = HashSet::new(); + for line in lines { + let sample = parse_prometheus_line(line).expect("parse line"); + if let Some(celsius) = sample + .labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)) + { + let Some(port_num) = required_port_num(&sample.labels) else { + continue; + }; + if cable_temp_ports.insert(port_num.to_string()) { + let labels = collector.build_labels(&sample.labels); + collector.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("cable_temperature_celsius:{}", port_num), + name: NMXT_METRIC_NAME.to_string(), + metric_type: "cable_temperature_celsius".to_string(), + unit: "celsius".to_string(), + value: celsius, + labels, + context: None, + } + .into(), + )); + } + } + } + + let samples = sink.samples.lock().unwrap(); + let temp_series: Vec<&MetricSample> = samples + .iter() + 
.filter(|s| s.metric_type == "cable_temperature_celsius") + .collect(); + assert_eq!( + temp_series.len(), + 1, + "exactly one series per port per scrape" + ); + + let series = temp_series[0]; + assert_eq!(series.name, "switch_nmxt"); + assert_eq!(series.unit, "celsius"); + assert_eq!(series.value, 37.5); + assert_eq!(series.key, "cable_temperature_celsius:11"); + assert!( + !series.labels.iter().any(|(k, _)| k == "cable_temp"), + "identity labels must no longer include cable_temp" + ); + assert!( + series + .labels + .iter() + .any(|(k, v)| k == "port_num" && v == "11"), + "identity labels still carry port_num" + ); + } } diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 3560b81d99..44816caa75 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -18,7 +18,7 @@ use std::time::Duration; use tonic::metadata::MetadataMap; -use tonic::transport::{Channel, Endpoint}; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; use tonic::{Extensions, Request}; use super::proto::g_nmi_client::GNmiClient as TonicGnmiClient; @@ -31,7 +31,7 @@ use crate::HealthError; use crate::config::NvueGnmiPaths; pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { - let mut paths = Vec::with_capacity(2); + let mut paths = Vec::with_capacity(4); if paths_config.components_enabled { paths.push(Path { elem: vec![ @@ -62,6 +62,38 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { ..Default::default() }); } + if paths_config.platform_general_enabled { + // `/platform-general/state` carries the memory and disk + // utilization leaves + paths.push(Path { + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "state".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + // `/platform-general/versions` carries the OS/BMC/EROT + // firmware version 
leaves + paths.push(Path { + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "versions".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } paths } @@ -73,6 +105,28 @@ pub struct GnmiClient { username: Option, password: Option, request_timeout: Duration, + dangerously_skip_tls_verification: bool, +} + +fn configure_tls_endpoint( + endpoint: Endpoint, + switch_id: &str, + dangerously_skip_tls_verification: bool, +) -> Result { + if !dangerously_skip_tls_verification { + return Ok(endpoint); + } + + // Use tonic's verifier hook (https endpoints get a strict verifier + // otherwise). No roots on ClientTlsConfig — roots + verifier is an error. + endpoint + .tls_config_with_verifier( + ClientTlsConfig::new(), + crate::collectors::nvue::tls::accept_any_cert_verifier(), + ) + .map_err(|e| { + HealthError::GnmiError(format!("switch {switch_id}: invalid gNMI TLS config: {e}")) + }) } impl GnmiClient { @@ -83,6 +137,7 @@ impl GnmiClient { username: Option, password: Option, request_timeout: Duration, + dangerously_skip_tls_verification: bool, ) -> Self { Self { switch_id, @@ -91,6 +146,7 @@ impl GnmiClient { username, password, request_timeout, + dangerously_skip_tls_verification, } } @@ -109,32 +165,34 @@ impl GnmiClient { )) })?; - let endpoint = Endpoint::from(uri) - .connect_timeout(self.request_timeout) - .timeout(self.request_timeout); - - let tls_config = crate::collectors::nvue::tls::self_signed_tls_config(); - let connector = hyper_rustls::HttpsConnectorBuilder::new() - .with_tls_config(tls_config) - .https_only() - .enable_http2() - .build(); - - let channel = endpoint - .connect_with_connector(connector) - .await - .map_err(|e| { - HealthError::GnmiError(format!( - "switch {}: connection failed to {target}: {e}", - self.switch_id - )) - })?; + let endpoint = configure_tls_endpoint( + Endpoint::from(uri), + &self.switch_id, + self.dangerously_skip_tls_verification, + )? 
+ .connect_timeout(self.request_timeout) + .timeout(self.request_timeout); + + let channel = endpoint.connect().await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: connection failed to {target}: {e}", + self.switch_id + )) + })?; - tracing::debug!( - switch_id = %self.switch_id, - target = %target, - "gNMI TLS channel established (skip-verify)" - ); + if self.dangerously_skip_tls_verification { + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established with certificate verification disabled" + ); + } else { + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established" + ); + } Ok(TonicGnmiClient::new(channel)) } @@ -438,10 +496,35 @@ mod tests { assert_eq!(typed_value_to_f64(&val), None); } + #[test] + fn test_gnmi_client_stores_dangerous_tls_skip_flag() { + let strict = GnmiClient::new( + "switch-1".to_string(), + "10.0.0.9", + 9339, + None, + None, + Duration::from_secs(30), + false, + ); + assert!(!strict.dangerously_skip_tls_verification); + + let dangerous = GnmiClient::new( + "switch-1".to_string(), + "10.0.0.9", + 9339, + None, + None, + Duration::from_secs(30), + true, + ); + assert!(dangerous.dangerously_skip_tls_verification); + } + #[test] fn test_nvue_subscribe_paths_all_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); - assert_eq!(paths.len(), 2); + assert_eq!(paths.len(), 4); assert_eq!(paths[0].elem.len(), 2); assert_eq!(paths[0].elem[0].name, "components"); @@ -450,6 +533,14 @@ mod tests { assert_eq!(paths[1].elem.len(), 2); assert_eq!(paths[1].elem[0].name, "interfaces"); assert_eq!(paths[1].elem[1].name, "interface"); + + assert_eq!(paths[2].elem.len(), 2); + assert_eq!(paths[2].elem[0].name, "platform-general"); + assert_eq!(paths[2].elem[1].name, "state"); + + assert_eq!(paths[3].elem.len(), 2); + assert_eq!(paths[3].elem[0].name, "platform-general"); + assert_eq!(paths[3].elem[1].name, "versions"); } #[test] @@ -457,6 
+548,7 @@ mod tests { let paths = nvue_subscribe_paths(&NvueGnmiPaths { components_enabled: false, interfaces_enabled: true, + platform_general_enabled: false, }); assert_eq!(paths.len(), 1); assert_eq!(paths[0].elem.len(), 2); @@ -464,11 +556,28 @@ mod tests { assert_eq!(paths[0].elem[1].name, "interface"); } + #[test] + fn test_nvue_subscribe_paths_platform_general_only() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: false, + platform_general_enabled: true, + }); + assert_eq!(paths.len(), 2); + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "platform-general"); + assert_eq!(paths[0].elem[1].name, "state"); + assert_eq!(paths[1].elem.len(), 2); + assert_eq!(paths[1].elem[0].name, "platform-general"); + assert_eq!(paths[1].elem[1].name, "versions"); + } + #[test] fn test_nvue_subscribe_paths_none_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths { components_enabled: false, interfaces_enabled: false, + platform_general_enabled: false, }); assert!(paths.is_empty()); } @@ -499,7 +608,7 @@ mod tests { let prefix = sub_list.prefix.expect("prefix must be set"); assert_eq!(prefix.target, "nvos", "target must be nvos"); - assert_eq!(sub_list.subscription.len(), 2); + assert_eq!(sub_list.subscription.len(), 4); for sub in &sub_list.subscription { assert_eq!( sub.mode, diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 20c06e3854..94f7b80950 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -27,7 +27,6 @@ use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; -/// process NVUE gNMI SAMPLE notifications and emit them as `CollectorEvent::Metric` pub(crate) struct GnmiSampleProcessor { pub(crate) data_sink: Option>, pub(crate) 
event_context: EventContext, @@ -98,6 +97,10 @@ impl GnmiSampleProcessor { } else if let Some(comp) = find_elem_key_ref(&combined, "component", "name") { entities.insert(("component", comp)); self.process_component_metric(&combined, comp, val); + } else if combined.iter().any(|e| e.name == "platform-general") { + // switch-level singleton: no name key, counted as one entity. + entities.insert(("platform-general", "")); + self.process_platform_general_metric(&combined, val); } } @@ -111,105 +114,261 @@ impl GnmiSampleProcessor { val: &proto::TypedValue, ) { if leaf_matches(elems, &["state", "oper-status"]) { - let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); - self.emit_data_metric( + let current = oper_status_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( "interface_oper_status", - iface_name, - v, - "state", - "interface_name", - iface_name, - ); - } else if leaf_matches(elems, &["state", "counters", "in-errors"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( - "interface_in_errors", - iface_name, - v, - "count", "interface_name", iface_name, + current, + OPER_STATUS_STATES, ); - } else if leaf_matches(elems, &["state", "counters", "out-errors"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( - "interface_out_errors", - iface_name, - v, - "count", + } else if let Some(metric_type) = numeric_interface_leaf(elems) { + match typed_value_to_f64(val) { + Some(v) => self.emit_iface(metric_type.name, iface_name, v, metric_type.unit), + None => debug_unmapped_value(elems, val, metric_type.name), + } + } else if leaf_matches(elems, &["infiniband", "state", "physical-port-state"]) { + let current = physical_port_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_physical_port_state", "interface_name", iface_name, + current, + PHYSICAL_PORT_STATES, ); - } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) - && let Some(v) = 
typed_value_to_f64(val) - { - self.emit_data_metric( - "interface_effective_ber", - iface_name, - v, - "ratio", + } else if leaf_matches(elems, &["infiniband", "state", "logical-port-state"]) { + let current = logical_port_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_logical_port_state", "interface_name", iface_name, + current, + LOGICAL_PORT_STATES, ); - } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( - "interface_symbol_ber", - iface_name, - v, - "ratio", + } else if leaf_matches(elems, &["infiniband", "state", "speed"]) { + match link_speed_to_gbps(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_link_speed_active", iface_name, v, "gbps"), + None => debug_unmapped_value(elems, val, "interface_link_speed_active"), + } + } else if leaf_matches(elems, &["infiniband", "state", "width"]) { + match link_width_to_f64(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_link_width_active", iface_name, v, "lanes"), + None => debug_unmapped_value(elems, val, "interface_link_width_active"), + } + } else if leaf_matches(elems, &["infiniband", "state", "supported-widths"]) { + match link_width_to_f64(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_supported_width", iface_name, v, "lanes"), + None => debug_unmapped_value(elems, val, "interface_supported_width"), + } + } else if leaf_matches(elems, &["phy-diag", "state", "phy-manager-state"]) { + let current = phy_manager_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_phy_manager_state", "interface_name", iface_name, + current, + PHY_MANAGER_STATES, ); - } else if leaf_matches( - elems, - &["phy-diag", "state", "unintentional-link-down-events"], - ) && let Some(v) = typed_value_to_f64(val) + } else if leaf_matches(elems, &["infiniband", "state", "vl-capabilities"]) + 
&& let Some(caps) = typed_value_to_string(val).filter(|s| !s.is_empty()) { - self.emit_data_metric( - "interface_link_down_events", - iface_name, - v, - "count", - "interface_name", + self.emit_iface_info( + "interface_vl_capabilities_info", iface_name, + "vl_capabilities", + &caps, ); } } + fn emit_iface(&self, metric_type: &str, iface_name: &str, value: f64, unit: &str) { + self.emit_data_metric( + metric_type, + iface_name, + value, + unit, + "interface_name", + iface_name, + ); + } + + /// per-interface info-metric: constant `1.0` sample with a string label beside `interface_name`. + fn emit_iface_info( + &self, + metric_type: &str, + iface_name: &str, + info_label_name: &'static str, + info_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + let mut key = String::with_capacity(metric_type.len() + 1 + iface_name.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(iface_name); + + let labels = vec![ + (Cow::Borrowed("interface_name"), iface_name.to_string()), + (Cow::Borrowed(info_label_name), info_label_value.to_string()), + ]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "info".to_string(), + value: 1.0, + labels, + context: None, + })), + ); + } + fn process_component_metric( &self, elems: &[&PathElem], comp_name: &str, val: &proto::TypedValue, ) { + // `/components/component` leaves: the `component_name` label + // distinguishes rows that share a leaf (e.g. 
FAN-STATE and CPU-STATE both resolve + // to `state/oper-status`) if leaf_matches(elems, &["healthz", "state", "status"]) { - let v = component_health_to_f64(typed_value_to_string(val).as_deref()); - self.emit_data_metric( + let current = component_health_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( "component_health_status", - comp_name, - v, - "state", "component_name", comp_name, + current, + COMPONENT_HEALTH_STATES, ); } else if leaf_matches(elems, &["state", "temperature", "instant"]) && let Some(v) = typed_value_to_f64(val) { - self.emit_data_metric( - "component_temperature_celsius", + self.emit_comp("component_temperature_celsius", comp_name, v, "celsius"); + } else if leaf_matches(elems, &["state", "oper-status"]) { + // FAN-STATE (row 966) and CPU-STATE (row 1174) share this leaf. + let current = oper_status_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "component_oper_status", + "component_name", + comp_name, + current, + OPER_STATUS_STATES, + ); + } else if leaf_matches(elems, &["asic", "state", "asic-temp"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_comp( + "component_asic_temperature_celsius", comp_name, v, "celsius", - "component_name", - comp_name, ); + } else if leaf_matches(elems, &["cpu", "utilization", "state", "avg"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_comp("component_cpu_utilization", comp_name, v, "percent"); + } + } + + fn emit_comp(&self, metric_type: &str, comp_name: &str, value: f64, unit: &str) { + self.emit_data_metric( + metric_type, + comp_name, + value, + unit, + "component_name", + comp_name, + ); + } + + fn process_platform_general_metric(&self, elems: &[&PathElem], val: &proto::TypedValue) { + let info: Option<(&str, &'static str)> = if leaf_matches(elems, &["state", "contact"]) { + Some(("platform_contact_info", "contact")) + } else if leaf_matches(elems, &["state", "location"]) { + Some(("platform_location_info", "location")) + } else 
if leaf_matches(elems, &["state", "platform-name"]) { + Some(("platform_node_description_info", "node_description")) + } else if leaf_matches(elems, &["versions", "state", "nos-version"]) { + Some(("platform_os_version_info", "os_version")) + } else if leaf_matches(elems, &["versions", "state", "fw-version-bmc"]) { + Some(("platform_bmc_version_info", "bmc_version")) + } else if leaf_matches(elems, &["versions", "state", "fw-version-erot"]) { + Some(("platform_erot_version_info", "erot_version")) + } else { + None + }; + if let Some((metric_type, info_label_name)) = info { + if let Some(s) = typed_value_to_string(val).filter(|s| !s.is_empty()) { + self.emit_switch_info(metric_type, info_label_name, &s); + } + return; + } + + let metric_type = if leaf_matches(elems, &["state", "memory-used"]) { + "platform_memory_used" + } else if leaf_matches(elems, &["state", "memory-total-size"]) { + "platform_memory_total" + } else if leaf_matches(elems, &["state", "disk-total-size"]) { + "platform_disk_total" + } else if leaf_matches(elems, &["state", "disk-used"]) { + "platform_disk_used" + } else { + return; + }; + + match typed_value_to_f64(val) { + Some(v) => self.emit_switch(metric_type, v, "bytes"), + None => debug_unmapped_value(elems, val, metric_type), } } + /// switch-level singleton series: no per-entity name, endpoint identity added by PrometheusSink. + fn emit_switch(&self, metric_type: &str, value: f64, unit: &str) { + let Some(sink) = &self.data_sink else { return }; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key: metric_type.to_string(), + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: unit.to_string(), + value, + labels: Vec::new(), + context: None, + })), + ); + } + + /// switch-level info-metric: constant `1.0` sample carrying a single string label. 
+ fn emit_switch_info( + &self, + metric_type: &str, + info_label_name: &'static str, + info_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + let labels = vec![(Cow::Borrowed(info_label_name), info_label_value.to_string())]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key: metric_type.to_string(), + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "info".to_string(), + value: 1.0, + labels, + context: None, + })), + ); + } + fn emit_data_metric( &self, metric_type: &str, @@ -226,8 +385,6 @@ impl GnmiSampleProcessor { key.push(':'); key.push_str(entity_id); - // only the domain-specific entity label; endpoint identity (ip, mac, - // serial_number, collector_type) is added by PrometheusSink from EventContext let labels = vec![( Cow::Borrowed(entity_label_name), entity_label_value.to_string(), @@ -246,6 +403,47 @@ impl GnmiSampleProcessor { })), ); } + + /// OpenMetrics StateSet: one `0.0`/`1.0` series per state (current == 1.0), with a `state` + /// label. 
+ fn emit_state_set( + &self, + metric_type: &str, + entity_label_name: &'static str, + entity_id: &str, + current_state: &str, + all_states: &[&'static str], + ) { + let Some(sink) = &self.data_sink else { return }; + + for state in all_states { + let mut key = + String::with_capacity(metric_type.len() + 1 + entity_id.len() + 1 + state.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(entity_id); + key.push(':'); + key.push_str(state); + + let labels = vec![ + (Cow::Borrowed(entity_label_name), entity_id.to_string()), + (Cow::Borrowed("state"), state.to_string()), + ]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "state".to_string(), + value: if *state == current_state { 1.0 } else { 0.0 }, + labels, + context: None, + })), + ); + } + } } fn find_elem_key_ref<'a>( @@ -270,18 +468,490 @@ fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { .all(|(elem, name)| elem.name == *name) } -fn oper_status_to_f64(status: Option<&str>) -> f64 { +struct NumericLeafMapping { + tail: &'static [&'static str], + name: &'static str, + unit: &'static str, +} + +struct NumericLeaf { + name: &'static str, + unit: &'static str, +} + +/// Table-driven dispatch for numeric `/interfaces/interface` leaves. The +/// expected leaf path tail is matched against the live gNMI tree. 
+fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { + const TABLE: &[NumericLeafMapping] = &[ + // OpenConfig interface counters (`/state/counters/*`) + NumericLeafMapping { + tail: &["state", "counters", "in-errors"], + name: "interface_in_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-errors"], + name: "interface_out_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-discards"], + name: "interface_out_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "in-octets"], + name: "interface_in_octets", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-octets"], + name: "interface_out_octets", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["state", "counters", "in-pkts"], + name: "interface_in_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-pkts"], + name: "interface_out_packets", + unit: "count", + }, + // InfiniBand port counters (`/infiniband/state/counters/port/*`) + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "link-downed"], + name: "interface_link_downed", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "infiniband", + "state", + "counters", + "port", + "link-error-recovery", + ], + name: "interface_link_error_recovery", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "infiniband", + "state", + "counters", + "port", + "rcv-remote-phy-errors", + ], + name: "interface_rcv_remote_physical_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "infiniband", + "state", + "counters", + "port", + "rcv-switch-relay-errors", + ], + name: "interface_rcv_switch_relay_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "infiniband", + "state", + "counters", + "port", + "rcv-constraints-errors", + ], + name: "interface_rcv_constraint_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ 
+ "infiniband", + "state", + "counters", + "port", + "local-link-integrity-errors", + ], + name: "interface_local_link_integrity_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "infiniband", + "state", + "counters", + "port", + "excessive-buffer-overrun", + ], + name: "interface_port_buffer_overrun_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "qp1-dropped"], + name: "interface_qp1_dropped", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "vl15-dropped"], + name: "interface_vl15_dropped", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "xmit-wait"], + name: "interface_port_xmit_wait", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "mtu"], + name: "interface_mtu", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "max-supported-mtus"], + name: "interface_max_supported_mtu", + unit: "bytes", + }, + // phy-diag counters and ratios (`/phy-diag/state/*`) + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber"], + name: "interface_raw_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "effective-ber"], + name: "interface_effective_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "symbol-ber"], + name: "interface_symbol_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber-ch-1"], + name: "interface_raw_ber_lane0", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber-ch-2"], + name: "interface_raw_ber_lane1", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-errors-ch-1"], + name: "interface_phy_raw_errors_lane0", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-errors-ch-2"], + name: "interface_phy_raw_errors_lane1", + unit: "count", + 
}, + NumericLeafMapping { + tail: &["phy-diag", "state", "effective-errors"], + name: "interface_phy_effective_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "zero-hist"], + name: "interface_zero_hist", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "phy-received-bits"], + name: "interface_phy_received_bits", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-malformed-packet-errors"], + name: "interface_port_malformed_packet_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-neighbor-mtu-discards"], + name: "interface_port_neighbor_mtu_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-multi-cast-rcv-pkts"], + name: "interface_port_multicast_rcv_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-multi-cast-xmit-pkts"], + name: "interface_port_multicast_xmit_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-uni-cast-rcv-pkts"], + name: "interface_port_unicast_rcv_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-uni-cast-xmit-pkts"], + name: "interface_port_unicast_xmit_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-local-physical-errors"], + name: "interface_port_local_physical_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "sync-header-error-counter"], + name: "interface_sync_header_error_counter", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-dlid-mapping-errors"], + name: "interface_port_dlid_mapping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-vl-mapping-errors"], + name: "interface_port_vl_mapping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", 
"port-looping-errors"], + name: "interface_port_looping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-inactive-discards"], + name: "interface_port_inactive_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "rq-general-error"], + name: "interface_rq_general_error", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-codes"], + name: "interface_plr_rcv_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-code-err"], + name: "interface_plr_rcv_codes_err", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-uncorrectable-code"], + name: "interface_plr_rcv_uncorrectables_code", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-xmit-codes"], + name: "interface_plr_xmit_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-xmit-retry-codes"], + name: "interface_plr_xmit_retrys_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-xmit-retry-events"], + name: "interface_plr_xmit_retrys_events", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-sync-events"], + name: "interface_plr_sync_events", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "phy-diag", + "state", + "plr-xmit-retry-events-within-t-sec-max", + ], + name: "interface_plr_xmit_retry_codes_within_minute", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-bw-loss-percent"], + name: "interface_plr_bw_loss_percent", + unit: "percent", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "unintentional-link-down-events"], + name: "interface_link_down_events", + unit: "count", + }, + ]; + + // FEC histogram bins 0..=15 -> interface_fec_hist_{n} + if let Some(leaf) = elems.last().map(|e| e.name.as_str()) + && let Some(bin) = 
leaf.strip_prefix("rs-num-corr-err-bin") + && let Ok(n) = bin.parse::() + && n <= 15 + && leaf_matches(elems, &["phy-diag", "state", leaf]) + { + return Some(NumericLeaf { + name: FEC_HIST_NAMES[n], + unit: "count", + }); + } + + TABLE.iter().find_map(|m| { + leaf_matches(elems, m.tail).then_some(NumericLeaf { + name: m.name, + unit: m.unit, + }) + }) +} + +/// FEC histogram bins 0..=15 +const FEC_HIST_NAMES: [&str; 16] = [ + "interface_fec_hist_0", + "interface_fec_hist_1", + "interface_fec_hist_2", + "interface_fec_hist_3", + "interface_fec_hist_4", + "interface_fec_hist_5", + "interface_fec_hist_6", + "interface_fec_hist_7", + "interface_fec_hist_8", + "interface_fec_hist_9", + "interface_fec_hist_10", + "interface_fec_hist_11", + "interface_fec_hist_12", + "interface_fec_hist_13", + "interface_fec_hist_14", + "interface_fec_hist_15", +]; + +const OPER_STATUS_STATES: &[&str] = &["up", "down"]; + +/// oper-status string -> current StateSet state. "up" when the source reads +/// "up" or "active" else "down". Applies to +/// `interface_oper_status` and `component_oper_status`. +fn oper_status_to_state(status: Option<&str>) -> &'static str { match status { - Some(s) if s.eq_ignore_ascii_case("up") => 1.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("up") || s.eq_ignore_ascii_case("active") => "up", + _ => "down", + } +} + +const PHYSICAL_PORT_STATES: &[&str] = &["up", "down"]; + +/// InfiniBand physical port state enum -> current StateSet state. Values +/// observed live on GB200: `LINK_UP`, `POLLING`, `PORT_CONFIGURATION_TRAINING`. +/// Binary: "up" only when the link is up; polling/training/everything-else is +/// "down". +fn physical_port_to_state(state: Option<&str>) -> &'static str { + match state { + Some(s) if s.eq_ignore_ascii_case("link_up") => "up", + _ => "down", + } +} + +const PHY_MANAGER_STATES: &[&str] = &["up", "down"]; + +/// PHY manager FSM state string -> current StateSet state. The PHY manager +/// reports a dynamic FSM label (e.g. 
"Active_or_Linkup", "Disabled"), so we +/// match the `active`/`linkup` tokens +fn phy_manager_to_state(state: Option<&str>) -> &'static str { + match state { + Some(s) + if s.split(|c: char| !c.is_ascii_alphanumeric()).any(|tok| { + tok.eq_ignore_ascii_case("active") || tok.eq_ignore_ascii_case("linkup") + }) => + { + "up" + } + _ => "down", + } +} + +const LOGICAL_PORT_STATES: &[&str] = &["active", "down"]; + +/// InfiniBand logical port state enum -> current StateSet state. +/// (e.g. `ACTIVE`, `DOWN`). "active" when the source reads +/// "active", else "down". +fn logical_port_to_state(state: Option<&str>) -> &'static str { + match state { + Some(s) if s.eq_ignore_ascii_case("active") => "active", + _ => "down", + } +} + +/// IB link width -> active lane count. Handles both the single live form +/// ("2X") and the comma-composite the NVOS schema allows for supported-widths +/// ("1X,2X,4X"); each token is parsed as `X` and the maximum lane count is +/// returned. Returns None when no token matches the `X` shape so unknown +/// widths are not exported. +fn link_width_to_f64(width: Option<&str>) -> Option { + let w = width?; + w.split(',') + .filter_map(|tok| { + tok.trim() + .strip_suffix(['X', 'x']) + .and_then(|digits| digits.parse::().ok()) + }) + .reduce(f64::max) +} + +/// IB link speed -> Gbps. GB200 emits bare numeric Gbps; we also accept the +/// suffix forms the schema permits. 
+fn link_speed_to_gbps(speed: Option<&str>) -> Option { + let s = speed?.trim(); + if s.is_empty() { + return None; + } + // handle Mbit suffix + if let Some(mbps) = s + .strip_suffix("Mb/s") + .or_else(|| s.strip_suffix("Mbps")) + .or_else(|| s.strip_suffix('M')) + { + return mbps.trim().parse::().ok().map(|v| v / 1000.0); } + // "G" Gbps suffix + if let Some(gbps) = s.strip_suffix(['G', 'g']) { + return gbps.trim().parse::().ok(); + } + // base case numeric implicit Gbps + s.parse::().ok() +} + +/// Log when an interface leaf that matched a known mapping but value wasn't caught. +fn debug_unmapped_value(elems: &[&PathElem], val: &proto::TypedValue, metric_type: &str) { + tracing::debug!( + leaf = %leaf_path(elems), + raw = ?typed_value_to_string(val), + metric_type, + "nvue_gnmi SAMPLE: matched leaf but value coercion returned None; dropping" + ); +} + +/// Render the gNMI element tail as a slash path for diagnostics, e.g. +/// "infiniband/state/speed". +fn leaf_path(elems: &[&PathElem]) -> String { + elems + .iter() + .map(|e| e.name.as_str()) + .collect::>() + .join("/") } -fn component_health_to_f64(status: Option<&str>) -> f64 { +const COMPONENT_HEALTH_STATES: &[&str] = &["healthy", "unhealthy", "unknown"]; + +/// component healthz status -> current StateSet state. "healthy"/"unhealthy" +/// else "unknown". 
+fn component_health_to_state(status: Option<&str>) -> &'static str { match status { - Some(s) if s.eq_ignore_ascii_case("healthy") => 1.0, - Some(s) if s.eq_ignore_ascii_case("unhealthy") => 2.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("healthy") => "healthy", + Some(s) if s.eq_ignore_ascii_case("unhealthy") => "unhealthy", + _ => "unknown", } } @@ -361,18 +1031,21 @@ mod tests { #[test] fn test_oper_status_mapping() { - assert_eq!(oper_status_to_f64(Some("UP")), 1.0); - assert_eq!(oper_status_to_f64(Some("up")), 1.0); - assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); - assert_eq!(oper_status_to_f64(None), 0.0); + assert_eq!(oper_status_to_state(Some("UP")), "up"); + assert_eq!(oper_status_to_state(Some("up")), "up"); + assert_eq!(oper_status_to_state(Some("DOWN")), "down"); + assert_eq!(oper_status_to_state(None), "down"); } #[test] fn test_component_health_mapping() { - assert_eq!(component_health_to_f64(Some("healthy")), 1.0); - assert_eq!(component_health_to_f64(Some("HEALTHY")), 1.0); - assert_eq!(component_health_to_f64(Some("unhealthy")), 2.0); - assert_eq!(component_health_to_f64(None), 0.0); + assert_eq!(component_health_to_state(Some("healthy")), "healthy"); + assert_eq!(component_health_to_state(Some("HEALTHY")), "healthy"); + assert_eq!(component_health_to_state(Some("unhealthy")), "unhealthy"); + assert_eq!(component_health_to_state(Some("UNHEALTHY")), "unhealthy"); + // unrecognized / absent => "unknown" + assert_eq!(component_health_to_state(Some("weird")), "unknown"); + assert_eq!(component_health_to_state(None), "unknown"); } fn make_path_elem(name: &str, keys: &[(&str, &str)]) -> PathElem { @@ -520,13 +1193,16 @@ mod tests { assert_eq!(count, 1); let events = sink.events.lock().expect("lock poisoned"); - assert_eq!(events.len(), 1); - let (context, event) = &events[0]; - assert_eq!(context.switch_id(), Some(switch_id)); - assert_eq!(context.switch_slot_number(), Some(7)); - assert_eq!(context.switch_tray_index(), Some(3)); - 
assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - assert!(matches!(event, CollectorEvent::Metric(_))); + // oper-status is a StateSet: one 0/1 series per state ("up"/"down"). + assert_eq!(events.len(), OPER_STATUS_STATES.len()); + // every emitted series preserves the switch-position context. + for (context, event) in events.iter() { + assert_eq!(context.switch_id(), Some(switch_id)); + assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + assert!(matches!(event, CollectorEvent::Metric(_))); + } } #[test] @@ -862,42 +1538,1350 @@ mod tests { assert_eq!(metrics.stream_errors_total.get(), 0.0); } - #[test] - fn test_process_subscribe_response_update_increments_notification_counter() { - let proc = test_processor(); - let metrics = test_stream_metrics(); - let resp = proto::SubscribeResponse { - response: Some(proto::subscribe_response::Response::Update( - proto::Notification { - timestamp: 0, - prefix: Some(proto::Path { - elem: vec![ - make_path_elem("interfaces", &[]), - make_path_elem("interface", &[("name", "nvl0")]), - ], - ..Default::default() - }), - update: vec![proto::Update { - path: Some(proto::Path { - elem: vec![ - make_path_elem("state", &[]), - make_path_elem("oper-status", &[]), - ], - ..Default::default() - }), - val: Some(make_typed_value_string("UP")), - ..Default::default() - }], + // ---- explicit GB200 mapping coverage ------------------------------------ + + /// Drive a single `/interfaces/interface[name=acp0]/` update and + /// return the one captured `MetricSample`, asserting the producer-level + /// invariants (stream `name`, `collector_type`, `interface_name` label). 
+ fn run_interface_leaf(tail: &[&str], val: proto::TypedValue) -> (MetricSample, EventContext) { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, ..Default::default() - }, - )), + }), + val: Some(val), + ..Default::default() + }], ..Default::default() }; + proc.process_notification(¬ification); - proc.process_subscribe_response(&resp, &metrics); + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + // shared producer invariants for every interface mapping. The + // `interface_name` label is always present as the first (entity) label; + // info-metrics may carry additional info labels after it, so assert the + // first label rather than the exact set. + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!( + sample.labels.first(), + Some(&(Cow::Borrowed("interface_name"), "acp0".to_string())) + ); + (*sample, ctx) + } - assert_eq!(metrics.notifications_received_total.get(), 1.0); - assert_eq!(metrics.monitored_entities.get(), 1.0); - assert_eq!(metrics.stream_errors_total.get(), 0.0); + /// Same as `run_interface_leaf` but for `/components/component[name=...]`. 
+ fn run_component_leaf(comp_name: &str, tail: &[&str], val: proto::TypedValue) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", comp_name)]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed("component_name"), comp_name.to_string())] + ); + *sample + } + + /// Drive a single `/interfaces/interface[name=acp0]/` update and + /// return ALL captured `MetricSample`s. Used for StateSet leaves, which + /// fan a single source value out into one 0/1 series per possible state. 
+ fn run_interface_leaf_all(tail: &[&str], val: proto::TypedValue) -> Vec { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + sink.events + .lock() + .expect("lock poisoned") + .iter() + .map(|(_, event)| { + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + (**sample).clone() + }) + .collect() + } + + /// Same as `run_interface_leaf_all` but for `/components/component[name=...]`. + fn run_component_leaf_all( + comp_name: &str, + tail: &[&str], + val: proto::TypedValue, + ) -> Vec { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", comp_name)]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + sink.events + .lock() + .expect("lock poisoned") + .iter() + .map(|(_, event)| { + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + (**sample).clone() + }) + .collect() + } + + /// Assert OpenMetrics StateSet semantics over a captured fan-out: exactly + /// one 
0/1 series per `all_states` entry, each with unit "state", the named + /// entity label present, and a `state` label; the series whose `state` + /// label equals `current` has value 1.0 and every other series is 0.0. + fn assert_state_set( + samples: &[MetricSample], + metric_type: &str, + entity_label: &str, + entity_id: &str, + all_states: &[&str], + current: &str, + ) { + assert_eq!( + samples.len(), + all_states.len(), + "{metric_type}: expected one series per state" + ); + for state in all_states { + let sample = samples + .iter() + .find(|s| s.labels.iter().any(|(k, v)| k == "state" && v == state)) + .unwrap_or_else(|| panic!("{metric_type}: missing series for state {state}")); + assert_eq!(sample.metric_type, metric_type, "state {state}"); + assert_eq!(sample.unit, "state", "state {state}"); + assert_eq!( + sample.value, + if *state == current { 1.0 } else { 0.0 }, + "{metric_type} state {state}: value (current={current})" + ); + assert!( + sample + .labels + .iter() + .any(|(k, v)| k == entity_label && v == entity_id), + "{metric_type} state {state}: missing entity label {entity_label}={entity_id}" + ); + } + } + + #[test] + fn test_interface_numeric_leaf_table_mappings() { + // (leaf tail, expected metric_type, expected unit) + let cases: &[(&[&str], &str, &str)] = &[ + ( + &["state", "counters", "in-errors"], + "interface_in_errors", + "count", + ), + ( + &["state", "counters", "out-errors"], + "interface_out_errors", + "count", + ), + ( + &["state", "counters", "out-discards"], + "interface_out_discards", + "count", + ), + ( + &["state", "counters", "in-octets"], + "interface_in_octets", + "bytes", + ), + ( + &["state", "counters", "out-octets"], + "interface_out_octets", + "bytes", + ), + ( + &["state", "counters", "in-pkts"], + "interface_in_packets", + "count", + ), + ( + &["state", "counters", "out-pkts"], + "interface_out_packets", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "link-downed"], + "interface_link_downed", + 
"count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "link-error-recovery", + ], + "interface_link_error_recovery", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-remote-phy-errors", + ], + "interface_rcv_remote_physical_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-switch-relay-errors", + ], + "interface_rcv_switch_relay_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-constraints-errors", + ], + "interface_rcv_constraint_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "local-link-integrity-errors", + ], + "interface_local_link_integrity_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "excessive-buffer-overrun", + ], + "interface_port_buffer_overrun_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "qp1-dropped"], + "interface_qp1_dropped", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "vl15-dropped"], + "interface_vl15_dropped", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "xmit-wait"], + "interface_port_xmit_wait", + "count", + ), + (&["infiniband", "state", "mtu"], "interface_mtu", "bytes"), + ( + &["infiniband", "state", "max-supported-mtus"], + "interface_max_supported_mtu", + "bytes", + ), + ( + &["phy-diag", "state", "raw-ber"], + "interface_raw_ber", + "ratio", + ), + ( + &["phy-diag", "state", "effective-ber"], + "interface_effective_ber", + "ratio", + ), + ( + &["phy-diag", "state", "symbol-ber"], + "interface_symbol_ber", + "ratio", + ), + ( + &["phy-diag", "state", "raw-ber-ch-1"], + "interface_raw_ber_lane0", + "ratio", + ), + ( + &["phy-diag", "state", "raw-ber-ch-2"], + "interface_raw_ber_lane1", + "ratio", + ), + ( + &["phy-diag", "state", "raw-errors-ch-1"], + "interface_phy_raw_errors_lane0", + "count", + ), + ( + &["phy-diag", "state", "raw-errors-ch-2"], 
+ "interface_phy_raw_errors_lane1", + "count", + ), + ( + &["phy-diag", "state", "effective-errors"], + "interface_phy_effective_errors", + "count", + ), + ( + &["phy-diag", "state", "zero-hist"], + "interface_zero_hist", + "count", + ), + ( + &["phy-diag", "state", "phy-received-bits"], + "interface_phy_received_bits", + "count", + ), + ( + &["phy-diag", "state", "port-malformed-packet-errors"], + "interface_port_malformed_packet_errors", + "count", + ), + ( + &["phy-diag", "state", "port-neighbor-mtu-discards"], + "interface_port_neighbor_mtu_discards", + "count", + ), + ( + &["phy-diag", "state", "port-multi-cast-rcv-pkts"], + "interface_port_multicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", "port-multi-cast-xmit-pkts"], + "interface_port_multicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-rcv-pkts"], + "interface_port_unicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-xmit-pkts"], + "interface_port_unicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-local-physical-errors"], + "interface_port_local_physical_errors", + "count", + ), + ( + &["phy-diag", "state", "sync-header-error-counter"], + "interface_sync_header_error_counter", + "count", + ), + ( + &["phy-diag", "state", "port-dlid-mapping-errors"], + "interface_port_dlid_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-vl-mapping-errors"], + "interface_port_vl_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-looping-errors"], + "interface_port_looping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-inactive-discards"], + "interface_port_inactive_discards", + "count", + ), + ( + &["phy-diag", "state", "rq-general-error"], + "interface_rq_general_error", + "count", + ), + ( + &["phy-diag", "state", "plr-rcv-codes"], + "interface_plr_rcv_codes", + "count", + ), + ( + &["phy-diag", "state", "plr-rcv-code-err"], + "interface_plr_rcv_codes_err", + "count", + ), 
+ ( + &["phy-diag", "state", "plr-rcv-uncorrectable-code"], + "interface_plr_rcv_uncorrectables_code", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-codes"], + "interface_plr_xmit_codes", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-codes"], + "interface_plr_xmit_retrys_codes", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-events"], + "interface_plr_xmit_retrys_events", + "count", + ), + ( + &["phy-diag", "state", "plr-sync-events"], + "interface_plr_sync_events", + "count", + ), + ( + &[ + "phy-diag", + "state", + "plr-xmit-retry-events-within-t-sec-max", + ], + "interface_plr_xmit_retry_codes_within_minute", + "count", + ), + ( + &["phy-diag", "state", "plr-bw-loss-percent"], + "interface_plr_bw_loss_percent", + "percent", + ), + ]; + + for (tail, expected_name, expected_unit) in cases { + let (sample, _) = run_interface_leaf(tail, make_typed_value_uint(7)); + assert_eq!( + &sample.metric_type, expected_name, + "metric_type mismatch for leaf {tail:?}" + ); + assert_eq!( + &sample.unit, expected_unit, + "unit mismatch for leaf {tail:?}" + ); + assert_eq!(sample.value, 7.0, "value mismatch for leaf {tail:?}"); + } + } + + #[test] + fn test_interface_fec_histogram_bins() { + for n in 0u8..=15 { + let leaf = format!("rs-num-corr-err-bin{n}"); + let (sample, _) = + run_interface_leaf(&["phy-diag", "state", &leaf], make_typed_value_uint(11)); + assert_eq!(sample.metric_type, format!("interface_fec_hist_{n}")); + assert_eq!(sample.unit, "count"); + assert_eq!(sample.value, 11.0); + } + } + + #[test] + fn test_interface_ber_parses_scientific_notation() { + // live BER values arrive as scientific-notation strings, e.g. 
"15E-255" + let (sample, _) = run_interface_leaf( + &["phy-diag", "state", "raw-ber"], + make_typed_value_string("1E-12"), + ); + assert_eq!(sample.metric_type, "interface_raw_ber"); + assert_eq!(sample.unit, "ratio"); + assert!((sample.value - 1e-12).abs() < f64::EPSILON); + } + + #[test] + fn test_interface_physical_port_state_enum() { + // Binary StateSet: only LINK_UP is "up"; polling/training/anything-else + // is "down" (regression: ordinal codes 2/3 collapsed to "down"). + for (raw, current) in [ + ("LINK_UP", "up"), + ("POLLING", "down"), + ("PORT_CONFIGURATION_TRAINING", "down"), + ("SOMETHING_ELSE", "down"), + ] { + let samples = run_interface_leaf_all( + &["infiniband", "state", "physical-port-state"], + make_typed_value_string(raw), + ); + assert_state_set( + &samples, + "interface_physical_port_state", + "interface_name", + "acp0", + PHYSICAL_PORT_STATES, + current, + ); + } + } + + #[test] + fn test_interface_logical_port_state_enum() { + for (raw, current) in [("ACTIVE", "active"), ("DOWN", "down")] { + let samples = run_interface_leaf_all( + &["infiniband", "state", "logical-port-state"], + make_typed_value_string(raw), + ); + assert_state_set( + &samples, + "interface_logical_port_state", + "interface_name", + "acp0", + LOGICAL_PORT_STATES, + current, + ); + } + } + + #[test] + fn test_phy_manager_to_state_helper() { + // token match, case-insensitive: active/linkup => "up" + assert_eq!(phy_manager_to_state(Some("Active_or_Linkup")), "up"); + assert_eq!(phy_manager_to_state(Some("LINKUP")), "up"); + assert_eq!(phy_manager_to_state(Some("active")), "up"); + // anything else => "down" + assert_eq!(phy_manager_to_state(Some("Disabled")), "down"); + assert_eq!(phy_manager_to_state(Some("")), "down"); + assert_eq!(phy_manager_to_state(None), "down"); + // regression: "active" is a substring of these down-states but must NOT + // match as up -- word-boundary token match, not substring. 
+ assert_eq!(phy_manager_to_state(Some("Inactive")), "down"); + assert_eq!(phy_manager_to_state(Some("Deactivated")), "down"); + } + + #[test] + fn test_interface_phy_manager_state_enum() { + // PHY-MANAGER-STATE (row 961): dynamic FSM string emitted as a StateSet. + for (raw, current) in [ + ("Active_or_Linkup", "up"), + ("LINKUP", "up"), + ("Disabled", "down"), + ("", "down"), + // regression for the substring bug: these contain "active" as a + // substring but are down-states. + ("Inactive", "down"), + ("Deactivated", "down"), + ] { + let samples = run_interface_leaf_all( + &["phy-diag", "state", "phy-manager-state"], + make_typed_value_string(raw), + ); + assert_state_set( + &samples, + "interface_phy_manager_state", + "interface_name", + "acp0", + PHY_MANAGER_STATES, + current, + ); + } + } + + #[test] + fn test_interface_oper_status_state_set() { + for (raw, current) in [("UP", "up"), ("active", "up"), ("DOWN", "down")] { + let samples = + run_interface_leaf_all(&["state", "oper-status"], make_typed_value_string(raw)); + assert_state_set( + &samples, + "interface_oper_status", + "interface_name", + "acp0", + OPER_STATUS_STATES, + current, + ); + } + } + + #[test] + fn test_interface_vl_capabilities_info() { + // VL-CAPABILITIES (row 965): non-empty string -> one info sample whose + // information is carried by the `vl_capabilities` label alongside + // `interface_name`. The shared invariant assert in `run_interface_leaf` + // only checks the first (interface_name) label, so assert the full set + // explicitly here. 
+ let (sample, _) = run_interface_leaf( + &["infiniband", "state", "vl-capabilities"], + make_typed_value_string("VL0-VL7"), + ); + assert_eq!(sample.metric_type, "interface_vl_capabilities_info"); + assert_eq!(sample.unit, "info"); + assert_eq!(sample.value, 1.0); + assert_eq!( + sample.labels, + vec![ + (Cow::Borrowed("interface_name"), "acp0".to_string()), + (Cow::Borrowed("vl_capabilities"), "VL0-VL7".to_string()), + ] + ); + } + + #[test] + fn test_interface_vl_capabilities_empty_is_not_exported() { + // An empty vl-capabilities string carries no information and emits nothing. + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("infiniband", &[]), + make_path_elem("state", &[]), + make_path_elem("vl-capabilities", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty vl-capabilities must not emit a metric" + ); + } + + #[test] + fn test_interface_link_width_enum() { + let (active, _) = run_interface_leaf( + &["infiniband", "state", "width"], + make_typed_value_string("2X"), + ); + assert_eq!(active.metric_type, "interface_link_width_active"); + assert_eq!(active.unit, "lanes"); + assert_eq!(active.value, 2.0); + + let (supported, _) = run_interface_leaf( + &["infiniband", "state", "supported-widths"], + make_typed_value_string("4X"), + ); + assert_eq!(supported.metric_type, "interface_supported_width"); + assert_eq!(supported.unit, "lanes"); + assert_eq!(supported.value, 4.0); + } 
+ + #[test] + fn test_component_explicit_leaf_mappings() { + // ASIC-TEMP-CURRENT (row 875) + let asic = run_component_leaf( + "ASIC1", + &["asic", "state", "asic-temp"], + make_typed_value_uint(46), + ); + assert_eq!(asic.metric_type, "component_asic_temperature_celsius"); + assert_eq!(asic.unit, "celsius"); + assert_eq!(asic.value, 46.0); + + // CPU-UTIL (row 885) + let cpu = run_component_leaf( + "cpu", + &["cpu", "utilization", "state", "avg"], + make_typed_value_uint(24), + ); + assert_eq!(cpu.metric_type, "component_cpu_utilization"); + assert_eq!(cpu.unit, "percent"); + assert_eq!(cpu.value, 24.0); + } + + #[test] + fn test_component_oper_status_shared_leaf_fan_and_cpu() { + // FAN-STATE (row 966) and CPU-STATE (row 1174) share state/oper-status; + // the component_name label is the only discriminator. Emitted as a + // StateSet (one 0/1 series per state). + let fan = run_component_leaf_all( + "FAN1/1", + &["state", "oper-status"], + make_typed_value_string("ACTIVE"), + ); + assert_state_set( + &fan, + "component_oper_status", + "component_name", + "FAN1/1", + OPER_STATUS_STATES, + "up", + ); + + let cpu = run_component_leaf_all( + "cpu", + &["state", "oper-status"], + make_typed_value_string("DOWN"), + ); + assert_state_set( + &cpu, + "component_oper_status", + "component_name", + "cpu", + OPER_STATUS_STATES, + "down", + ); + } + + #[test] + fn test_component_health_status_state_set() { + // healthz status emitted as a 3-state StateSet; unrecognized => unknown. + for (raw, current) in [ + ("healthy", "healthy"), + ("unhealthy", "unhealthy"), + ("something_weird", "unknown"), + ] { + let samples = run_component_leaf_all( + "ASIC1", + &["healthz", "state", "status"], + make_typed_value_string(raw), + ); + assert_state_set( + &samples, + "component_health_status", + "component_name", + "ASIC1", + COMPONENT_HEALTH_STATES, + current, + ); + } + } + + #[test] + fn test_unknown_interface_leaf_is_not_exported() { + // a live but unmapped leaf (e.g. 
ip-address, which is not in any + // canonical mapping arm or the numeric table) must never produce a + // MetricSample. + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("ip-address", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("10.0.0.1")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unmapped leaf must not emit a metric" + ); + } + + #[test] + fn test_link_width_to_f64_helper() { + assert_eq!(link_width_to_f64(Some("1X")), Some(1.0)); + assert_eq!(link_width_to_f64(Some("2X")), Some(2.0)); + assert_eq!(link_width_to_f64(Some("4x")), Some(4.0)); + // comma-composite supported-widths -> max lane count + assert_eq!(link_width_to_f64(Some("1X,2X,4X")), Some(4.0)); + assert_eq!(link_width_to_f64(Some("1X, 2X")), Some(2.0)); + // partially-unrecognized composites still yield the max of the valid lanes + assert_eq!(link_width_to_f64(Some("2X,foo")), Some(2.0)); + assert_eq!(link_width_to_f64(Some("VL0-VL7")), None); + assert_eq!(link_width_to_f64(Some("")), None); + assert_eq!(link_width_to_f64(None), None); + } + + #[test] + fn test_link_speed_to_gbps_helper() { + // live GB200: bare numerics are already Gbps + assert_eq!(link_speed_to_gbps(Some("400")), Some(400.0)); + assert_eq!(link_speed_to_gbps(Some("100")), Some(100.0)); + assert_eq!(link_speed_to_gbps(Some("0")), Some(0.0)); + assert_eq!(link_speed_to_gbps(Some("2.5")), Some(2.5)); + // defensive: trailing "G"/"g" suffix (NVOS schema enum form) 
+ assert_eq!(link_speed_to_gbps(Some("400G")), Some(400.0)); + assert_eq!(link_speed_to_gbps(Some("2.5g")), Some(2.5)); + // defensive: Mb/s and M suffix -> divide by 1000 + assert_eq!(link_speed_to_gbps(Some("1000Mb/s")), Some(1.0)); + assert_eq!(link_speed_to_gbps(Some("1000M")), Some(1.0)); + // unrecognized -> None + assert_eq!(link_speed_to_gbps(Some("hdr")), None); + assert_eq!(link_speed_to_gbps(Some("")), None); + assert_eq!(link_speed_to_gbps(None), None); + } + + #[test] + fn test_interface_link_speed_active_gbps() { + // bare numerics (live GB200 form) pass through as Gbps + for (raw, expected) in [("400", 400.0), ("100", 100.0), ("0", 0.0)] { + let (sample, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string(raw), + ); + assert_eq!(sample.metric_type, "interface_link_speed_active"); + assert_eq!(sample.unit, "gbps", "speed unit must be gbps for {raw}"); + assert_eq!(sample.value, expected, "speed {raw}"); + } + + // defensive suffix forms + let (g_suffix, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("400G"), + ); + assert_eq!(g_suffix.unit, "gbps"); + assert_eq!(g_suffix.value, 400.0); + + let (g_frac, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("2.5G"), + ); + assert_eq!(g_frac.value, 2.5); + + let (mb, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("1000Mb/s"), + ); + assert_eq!(mb.unit, "gbps"); + assert_eq!(mb.value, 1.0); + } + + #[test] + fn test_interface_link_speed_unparseable_is_not_exported() { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: 
Some(proto::Path { + elem: vec![ + make_path_elem("infiniband", &[]), + make_path_elem("state", &[]), + make_path_elem("speed", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("hdr")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unparseable speed must not emit a metric" + ); + } + + #[test] + fn test_oper_status_active_is_up() { + assert_eq!(oper_status_to_state(Some("ACTIVE")), "up"); + assert_eq!(oper_status_to_state(Some("active")), "up"); + assert_eq!(oper_status_to_state(Some("DOWN")), "down"); + } + + #[test] + fn test_process_subscribe_response_update_increments_notification_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Update( + proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }, + )), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 1.0); + assert_eq!(metrics.monitored_entities.get(), 1.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } + + // ---- /platform-general switch-level singleton coverage ----------------- + + /// Drive a single `/platform-general/` update and return the one + /// captured `MetricSample`, asserting the producer-level invariants (stream + /// `name`, `collector_type`, and that the switch-level singleton carries no + /// 
per-entity name label). + fn run_platform_general_leaf(tail: &[&str], val: proto::TypedValue) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert!( + sample.labels.is_empty(), + "switch-level singleton must not carry a per-entity name label" + ); + *sample + } + + #[test] + fn test_platform_general_numeric_leaf_mappings() { + // (leaf tail, raw bytes value, expected metric_type, expected value) + // values are the authoritative live GB200 Stage-0 capture. 
+ let cases: &[(&[&str], u64, &str)] = &[ + ( + &["state", "memory-used"], + 3_856_510_976, + "platform_memory_used", + ), + ( + &["state", "memory-total-size"], + 16_151_990_272, + "platform_memory_total", + ), + ( + &["state", "disk-total-size"], + 77_780_082_688, + "platform_disk_total", + ), + ( + &["state", "disk-used"], + 22_848_192_512, + "platform_disk_used", + ), + ]; + for (tail, raw, metric_type) in cases { + let sample = run_platform_general_leaf(tail, make_typed_value_uint(*raw)); + assert_eq!(sample.metric_type, *metric_type, "leaf {tail:?}"); + assert_eq!(sample.unit, "bytes", "leaf {tail:?} unit must be bytes"); + assert_eq!(sample.value, *raw as f64, "leaf {tail:?} value"); + } + } + + #[test] + fn test_platform_general_non_numeric_value_is_not_exported() { + // A numeric leaf whose value cannot be coerced to f64 emits nothing. + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem("memory-used", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("not-a-number")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "non-numeric platform-general value must not emit a metric" + ); + } + + #[test] + fn test_platform_general_unmapped_string_leaf_is_not_exported() { + // A platform-general string leaf that is not one of the mapped info + // leaves (contact/location/platform-name) must fall through and emit + // nothing, while still being counted as the platform-general entity. 
+ let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem("product-name", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("MQM9700")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(&notification); + // the platform-general entity is still counted, but nothing is emitted + assert_eq!(count, 1); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unmapped platform-general string leaf must not emit a metric" + ); + } + + #[test] + fn test_platform_general_empty_info_string_is_not_exported() { + // CONTACT/LOCATION are empty on the GB200 rig; an empty info string + // carries no information and must emit nothing. 
+ for leaf in ["contact", "location", "platform-name"] { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem(leaf, &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(&notification); + assert_eq!( + count, 1, + "platform-general entity is still counted for {leaf}" + ); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty info string must not emit a metric for {leaf}" + ); + } + } + + #[test] + fn test_platform_general_node_description_info() { + // NODE-DESCRIPTION (row 864): a non-empty platform-name emits a single + // switch-level info-metric carrying the raw string as `node_description`. + let sample = run_platform_general_leaf_info( + &["state", "platform-name"], + "x86_64-nvidia_n5400_ld-r0", + ); + assert_eq!(sample.metric_type, "platform_node_description_info"); + assert_eq!(sample.unit, "info"); + assert_eq!(sample.value, 1.0); + assert_eq!( + sample.labels, + vec![( + Cow::Borrowed("node_description"), + "x86_64-nvidia_n5400_ld-r0".to_string() + )] + ); + } + + #[test] + fn test_platform_general_contact_and_location_info() { + // CONTACT (862) / LOCATION (863): non-empty strings emit their info + // series with the matching single label. 
+ for (leaf, metric_type, label, raw) in [ + ( + "contact", + "platform_contact_info", + "contact", + "noc@example.com", + ), + ("location", "platform_location_info", "location", "rack-7"), + ] { + let sample = run_platform_general_leaf_info(&["state", leaf], raw); + assert_eq!(sample.metric_type, metric_type, "leaf {leaf}"); + assert_eq!(sample.unit, "info", "leaf {leaf}"); + assert_eq!(sample.value, 1.0, "leaf {leaf}"); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed(label), raw.to_string())], + "leaf {leaf}" + ); + } + } + + #[test] + fn test_platform_general_version_info_metrics() { + // OS-VERSION (868) / BMC-VERSION (869) / EROT-FW-VERSION (870): non-empty + // version strings under `/platform-general/versions/state` each emit one + // switch-level info-metric carrying the raw version in a single label. + // Values are the authoritative live GB200 Stage-0 capture. + for (tail, metric_type, label, raw) in [ + ( + ["versions", "state", "nos-version"], + "platform_os_version_info", + "os_version", + "nvos-25.02.2553", + ), + ( + ["versions", "state", "fw-version-bmc"], + "platform_bmc_version_info", + "bmc_version", + "88.0002.1336", + ), + ( + ["versions", "state", "fw-version-erot"], + "platform_erot_version_info", + "erot_version", + "01.04.0026.0000_n04", + ), + ] { + let sample = run_platform_general_leaf_info(&tail, raw); + assert_eq!(sample.metric_type, metric_type, "leaf {tail:?}"); + assert_eq!(sample.unit, "info", "leaf {tail:?}"); + assert_eq!(sample.value, 1.0, "leaf {tail:?}"); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed(label), raw.to_string())], + "leaf {tail:?}" + ); + } + } + + #[test] + fn test_platform_general_empty_version_string_is_not_exported() { + // An empty version string carries no information and must emit nothing, + // while still being counted as the platform-general entity. 
+ for tail in [ + ["versions", "state", "nos-version"], + ["versions", "state", "fw-version-bmc"], + ["versions", "state", "fw-version-erot"], + ] { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(&notification); + assert_eq!( + count, 1, + "platform-general entity is still counted for {tail:?}" + ); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty version string must not emit a metric for {tail:?}" + ); + } + } + + /// Drive a single `/platform-general/` string update and return the + /// one captured info `MetricSample`. Unlike `run_platform_general_leaf`, the + /// switch-level info series carries a single string label (no per-entity + /// name), so the empty-labels invariant does not apply. 
+ fn run_platform_general_leaf_info(tail: &[&str], raw: &str) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(make_typed_value_string(raw)), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(&notification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + *sample } } diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index 7c4f8e4bfb..156aabdb4f 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -187,6 +187,7 @@ struct GnmiClientProvider { switch_ip: String, port: u16, request_timeout: Duration, + dangerously_skip_tls_verification: bool, credentials: Arc<GnmiCredentialCache>, } @@ -215,6 +216,7 @@ impl GnmiClientProvider { credentials.username, credentials.password, self.request_timeout, + self.dangerously_skip_tls_verification, ), generation, )) @@ -459,6 +461,7 @@ pub fn spawn_gnmi_collector( switch_ip, port: gnmi_config.gnmi_port, request_timeout: gnmi_config.request_timeout, + dangerously_skip_tls_verification: gnmi_config.dangerously_skip_tls_verification, credentials: Arc::new(GnmiCredentialCache::new( credential_provider, endpoint.addr.clone(), @@ -846,6 +849,7 @@ mod 
tests { switch_ip: addr.ip.to_string(), port: 9339, request_timeout: Duration::from_secs(1), + dangerously_skip_tls_verification: false, credentials: Arc::new(GnmiCredentialCache::new(provider, addr)), } } diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index d9f53dd5a3..d133b13c5e 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -33,6 +33,9 @@ const NVUE_SYSTEM_HEALTH: &str = "/nvue_v1/system/health"; const NVUE_CLUSTER_APPS: &str = "/nvue_v1/cluster/apps"; const NVUE_SDN_PARTITIONS: &str = "/nvue_v1/sdn/partition"; const NVUE_INTERFACES: &str = "/nvue_v1/interface"; +const NVUE_PLATFORM_ENVIRONMENT_FAN: &str = "/nvue_v1/platform/environment/fan"; +const NVUE_PLATFORM_ENVIRONMENT_TEMPERATURE: &str = "/nvue_v1/platform/environment/temperature"; +const NVUE_PLATFORM_ENVIRONMENT: &str = "/nvue_v1/platform/environment"; #[derive(Clone)] pub struct UsernamePassword { @@ -125,6 +128,36 @@ impl RestClient { self.do_get(url, &[]).await.map(Some) } + pub async fn get_platform_environment_fan( + &self, + ) -> Result, HealthError> { + if !self.paths.platform_environment_fan_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT_FAN)?; + self.do_get(url, &[]).await.map(Some) + } + + pub async fn get_platform_environment_temperature( + &self, + ) -> Result, HealthError> { + if !self.paths.platform_environment_temperature_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT_TEMPERATURE)?; + self.do_get(url, &[]).await.map(Some) + } + + pub async fn get_platform_environment( + &self, + ) -> Result, HealthError> { + if !self.paths.platform_environment_status_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT)?; + self.do_get(url, &[]).await.map(Some) + } + pub async fn get_interfaces(&self) -> Result, HealthError> { if !self.paths.interfaces_enabled 
{ return Ok(None); @@ -287,6 +320,40 @@ pub struct SdnPartition { pub num_gpus: Option, } +pub type FanEnvironmentResponse = HashMap; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct FanData { + /// Fan maximum speed in RPM, scraped as string (e.g. "33000") + #[serde(rename = "max-speed")] + pub max_speed: Option, +} + +pub type TemperatureEnvironmentResponse = HashMap; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct TempData { + /// Current temperature Celsius, scraped as string (e.g. "43.00"). + /// Field is optional per sensor + pub current: Option, + /// Maximum (warning) threshold in Celsius as string (e.g. "105.00"). + pub max: Option, + /// Critical threshold in Celsius as a string (e.g. "120.00"). + pub crit: Option, + /// Sensor state as string (e.g. "ok"). + pub state: Option, +} + +/// `/nvue_v1/platform/environment` summary. Keys are aggregate status +/// entries (e.g. `FAN_STATUS`) as well as the `fan`/`temperature` subtrees +pub type PlatformEnvironmentResponse = HashMap; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct EnvItem { + /// Aggregate status string (e.g. "green"/"amber" for `FAN_STATUS`). 
+ pub state: Option, +} + pub type InterfacesResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] @@ -520,6 +587,102 @@ mod tests { assert!(eth0.link.speed.is_none()); } + #[test] + fn test_parse_platform_environment_fan() { + let json = r#"{ + "FAN1/1": { + "current-speed": "10096", + "direction": "F2B", + "max-speed": "33000", + "min-speed": "6000", + "state": "ok" + }, + "FAN1/2": { + "current-speed": "9800", + "direction": "F2B", + "max-speed": "33000", + "min-speed": "6000", + "state": "ok" + } + }"#; + + let resp: FanEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 2); + assert_eq!(resp["FAN1/1"].max_speed.as_deref(), Some("33000")); + assert_eq!(resp["FAN1/2"].max_speed.as_deref(), Some("33000")); + } + + #[test] + fn test_parse_platform_environment_fan_missing_max_speed() { + let json = r#"{ + "FAN1/1": { + "current-speed": "10096", + "direction": "F2B", + "min-speed": "6000", + "state": "ok" + } + }"#; + + let resp: FanEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 1); + assert!(resp["FAN1/1"].max_speed.is_none()); + } + + #[test] + fn test_parse_platform_environment_temperature() { + let json = r#"{ + "ASIC1": {"crit": "120.00", "current": "43.00", "max": "105.00", "state": "ok"}, + "Ambient-MNG-Temp": {"current": "27.00", "state": "ok"}, + "PDB-Conv-1-Temp": {"crit": "115.00", "current": "38.00", "state": "ok"} + }"#; + + let resp: TemperatureEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 3); + + let asic1 = &resp["ASIC1"]; + assert_eq!(asic1.current.as_deref(), Some("43.00")); + assert_eq!(asic1.max.as_deref(), Some("105.00")); + assert_eq!(asic1.crit.as_deref(), Some("120.00")); + assert_eq!(asic1.state.as_deref(), Some("ok")); + + // Ambient sensor reports only current + state. 
+ let ambient = &resp["Ambient-MNG-Temp"]; + assert_eq!(ambient.current.as_deref(), Some("27.00")); + assert!(ambient.max.is_none()); + assert!(ambient.crit.is_none()); + assert_eq!(ambient.state.as_deref(), Some("ok")); + + // PDB sensor has crit + current + state but no max. + let pdb = &resp["PDB-Conv-1-Temp"]; + assert_eq!(pdb.crit.as_deref(), Some("115.00")); + assert!(pdb.max.is_none()); + } + + #[test] + fn test_parse_platform_environment_fan_status() { + // Parent summary carries the aggregate `FAN_STATUS` LED entry alongside + // nested `fan`/`temperature` subtree objects of a different shape. The + // LED entry parses into `state`; the nested objects parse with `state` + // absent (serde ignores unknown keys) and are skipped by callers. + let json = r#"{ + "FAN_STATUS": {"state": "green", "type": "led"}, + "PSU_STATUS": {"state": "amber", "type": "led"}, + "fan": { + "FAN1/1": {"current-speed": "10096", "max-speed": "33000", "state": "ok"} + }, + "temperature": { + "ASIC1": {"current": "43.00", "state": "ok"} + } + }"#; + + let resp: PlatformEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp["FAN_STATUS"].state.as_deref(), Some("green")); + assert_eq!(resp["PSU_STATUS"].state.as_deref(), Some("amber")); + // nested subtree objects have no top-level state -> None. 
+ assert!(resp["fan"].state.is_none()); + assert!(resp["temperature"].state.is_none()); + } + #[test] fn test_parse_empty_responses() { let empty_map: ClusterAppsResponse = serde_json::from_str("{}").unwrap(); @@ -530,5 +693,14 @@ mod tests { let empty_interfaces: InterfacesResponse = serde_json::from_str("{}").unwrap(); assert!(empty_interfaces.is_empty()); + + let empty_fans: FanEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_fans.is_empty()); + + let empty_temps: TemperatureEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_temps.is_empty()); + + let empty_env: PlatformEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_env.is_empty()); } } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 2165a5f9d2..3d20042853 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -28,33 +28,45 @@ use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; const COLLECTOR_NAME: &str = "nvue_rest"; -fn system_health_to_f64(status: Option<&str>) -> f64 { +const SYSTEM_HEALTH_STATES: &[&str] = &["ok", "not_ok", "unknown"]; + +fn system_health_to_state(status: Option<&str>) -> &'static str { match status { - Some("OK") => 1.0, - Some("Not OK") => 2.0, - _ => 0.0, + Some("OK") => "ok", + Some("Not OK") => "not_ok", + _ => "unknown", } } -fn partition_health_to_f64(status: Option<&str>) -> f64 { +const PARTITION_HEALTH_STATES: &[&str] = &[ + "healthy", + "degraded_bandwidth", + "degraded", + "unhealthy", + "unknown", +]; + +fn partition_health_to_state(status: Option<&str>) -> &'static str { match status { - Some("healthy") => 1.0, - Some("degraded_bandwidth") => 2.0, - Some("degraded") => 3.0, - Some("unhealthy") => 4.0, - _ => 0.0, + Some("healthy") => "healthy", + Some("degraded_bandwidth") => "degraded_bandwidth", + Some("degraded") => "degraded", + 
Some("unhealthy") => "unhealthy", + _ => "unknown", } } -fn app_status_to_f64(status: Option<&str>) -> f64 { +const APP_STATUS_STATES: &[&str] = &["ok", "not_ok", "unknown"]; + +fn app_status_to_state(status: Option<&str>) -> &'static str { match status { - Some("ok") => 1.0, - Some("not ok") => 2.0, - _ => 0.0, + Some("ok") => "ok", + Some("not ok") => "not_ok", + _ => "unknown", } } -/// code "0" means no issue; any other opcode indicates a problem +/// "0" -> no issue. Any other opcode indicates a problem fn diagnostic_opcode_to_f64(code: &str) -> f64 { match code { "0" => 0.0, @@ -62,6 +74,48 @@ fn diagnostic_opcode_to_f64(code: &str) -> f64 { } } +/// NVUE reports fan max-speed as a string (e.g. "33000"). Parse it to RPM. +/// Returns None when the field is absent or unparseable. +fn fan_max_speed_to_f64(max_speed: Option<&str>) -> Option<f64> { + max_speed.and_then(|s| s.trim().parse::<f64>().ok()) +} + +/// NVUE reports temps (current/max/crit) as Celsius strings (e.g. "105.00"). +/// Parse to f64. Returns None when the field is absent or unparseable. +fn temp_to_f64(value: Option<&str>) -> Option<f64> { + value.and_then(|s| s.trim().parse::<f64>().ok()) +} + +const TEMP_STATE_STATES: &[&str] = &["ok", "not_ok"]; + +/// Sensor `state` -> StateSet: "ok" (case-insensitive) => "ok", other present +/// => "not_ok", absent => None. +fn temp_state_to_state(state: Option<&str>) -> Option<&'static str> { + state.map(|s| { + if s.trim().eq_ignore_ascii_case("ok") { + "ok" + } else { + "not_ok" + } + }) +} + +const FAN_LED_STATES: &[&str] = &["ok", "not_ok"]; + +/// `FAN_STATUS` LED -> StateSet: "green"/"ok" (case-insensitive) => "ok", +/// other non-empty => "not_ok", absent/empty => None. 
+fn fan_led_to_state(state: Option<&str>) -> Option<&'static str> { + let s = state?.trim(); + if s.is_empty() { + return None; + } + if s.eq_ignore_ascii_case("green") || s.eq_ignore_ascii_case("ok") { + Some("ok") + } else { + Some("not_ok") + } +} + pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, pub data_sink: Option>, @@ -135,8 +189,8 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_system_health().await { Ok(Some(health)) => { - let value = system_health_to_f64(health.status.as_deref()); - self.emit_metric("system_health", None, value, "state", vec![]); + let current = system_health_to_state(health.status.as_deref()); + self.emit_state_set("system_health", None, current, SYSTEM_HEALTH_STATES, vec![]); entity_count += 1; } Ok(None) => {} @@ -154,12 +208,12 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_cluster_apps().await { Ok(Some(apps)) => { for (name, app) in &apps { - let value = app_status_to_f64(app.status.as_deref()); - self.emit_metric( + let current = app_status_to_state(app.status.as_deref()); + self.emit_state_set( "cluster_app", Some(name), - value, - "state", + current, + APP_STATUS_STATES, vec![(Cow::Borrowed("app_name"), name.clone())], ); entity_count += 1; @@ -181,18 +235,18 @@ impl PeriodicCollector for NvueRestCollector { Ok(Some(partitions)) => { for (part_id, partition) in &partitions { let part_name = partition.name.as_deref().unwrap_or(part_id); - let health_value = partition_health_to_f64(partition.health.as_deref()); + let health_state = partition_health_to_state(partition.health.as_deref()); let gpu_count = partition.num_gpus.unwrap_or(0) as f64; let partition_labels = vec![ (Cow::Borrowed("partition_id"), part_id.clone()), (Cow::Borrowed("partition_name"), part_name.to_string()), ]; - self.emit_metric( + self.emit_state_set( "partition_health", Some(part_id), - health_value, - "state", + health_state, + PARTITION_HEALTH_STATES, partition_labels.clone(), ); 
self.emit_metric( @@ -246,6 +300,118 @@ impl PeriodicCollector for NvueRestCollector { } } + match self.client.get_platform_environment_fan().await { + Ok(Some(fans)) => { + for (fan_name, fan) in &fans { + // Only emit when max-speed parses. Absent or garbage emits nothing. + if let Some(value) = fan_max_speed_to_f64(fan.max_speed.as_deref()) { + self.emit_metric( + "fan_max_speed", + Some(fan_name), + value, + "rpm", + vec![(Cow::Borrowed("fan_name"), fan_name.clone())], + ); + entity_count += 1; + } + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment fan" + ); + } + } + + match self.client.get_platform_environment_temperature().await { + Ok(Some(temps)) => { + for (sensor_name, temp) in &temps { + // Each field is optional. Emit only those present and parseable. + let sensor_label = || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; + + if let Some(value) = temp_to_f64(temp.current.as_deref()) { + self.emit_metric( + "platform_temperature", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + if let Some(value) = temp_to_f64(temp.max.as_deref()) { + self.emit_metric( + "platform_temperature_max", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + if let Some(value) = temp_to_f64(temp.crit.as_deref()) { + self.emit_metric( + "platform_temperature_critical", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + // Absent state emits nothing. Present state emits one 0/1 series per state. 
+ if let Some(current) = temp_state_to_state(temp.state.as_deref()) { + self.emit_state_set( + "platform_temperature_state", + Some(sensor_name), + current, + TEMP_STATE_STATES, + sensor_label(), + ); + entity_count += 1; + } + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment temperature" + ); + } + } + + match self.client.get_platform_environment().await { + Ok(Some(env)) => { + // Switch-level FAN_STATUS LED. Emit only when present and mappable. + if let Some(current) = env + .get("FAN_STATUS") + .and_then(|s| fan_led_to_state(s.state.as_deref())) + { + self.emit_state_set("fan_led", None, current, FAN_LED_STATES, vec![]); + entity_count += 1; + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment status" + ); + } + } + if saw_auth_failure { tracing::warn!( switch_id = %self.switch_id, @@ -341,6 +507,38 @@ impl NvueRestCollector { .into(), )); } + + /// Emit an OpenMetrics StateSet: one 0/1 series per state (current => 1.0), + /// each carrying `labels` plus a `state` label. `key_base` is suffixed with + /// the state name for a unique per-series key. Unit is always "state". + fn emit_state_set( + &self, + metric_type: &str, + key_base: Option<&str>, + current_state: &str, + all_states: &[&str], + labels: Vec<(Cow<'static, str>, String)>, + ) { + for state in all_states { + let mut series_labels = labels.clone(); + series_labels.push((Cow::Borrowed("state"), state.to_string())); + + // suffix state onto the qualifier for a unique per-series key + // (switch-level series use the state name alone). 
+ let qualifier = match key_base { + Some(base) => format!("{base}:{state}"), + None => (*state).to_string(), + }; + + self.emit_metric( + metric_type, + Some(&qualifier), + if *state == current_state { 1.0 } else { 0.0 }, + "state", + series_labels, + ); + } + } } #[cfg(test)] @@ -357,30 +555,72 @@ mod tests { use crate::bmc::BoxFuture; use crate::config::NvueRestPaths; + /// Assert StateSet semantics: one 0/1 series per state (current => 1.0), + /// each with unit "state" and a `state` label. `entity` (if set) is present + /// on every series. + fn assert_state_set( + samples: &[MetricSample], + metric_type: &str, + entity: Option<(&str, &str)>, + all_states: &[&str], + current: &str, + ) { + let series: Vec<&MetricSample> = samples + .iter() + .filter(|s| s.metric_type == metric_type) + .collect(); + assert_eq!( + series.len(), + all_states.len(), + "{metric_type}: expected one series per state" + ); + for state in all_states { + let sample = series + .iter() + .find(|s| s.labels.iter().any(|(k, v)| k == "state" && v == state)) + .unwrap_or_else(|| panic!("{metric_type}: missing series for state {state}")); + assert_eq!(sample.unit, "state", "state {state}"); + assert_eq!( + sample.value, + if *state == current { 1.0 } else { 0.0 }, + "{metric_type} state {state}: value (current={current})" + ); + if let Some((label, value)) = entity { + assert!( + sample.labels.iter().any(|(k, v)| k == label && v == value), + "{metric_type} state {state}: missing entity label {label}={value}" + ); + } + } + } + #[test] fn test_system_health_mapping() { - assert_eq!(system_health_to_f64(Some("OK")), 1.0); - assert_eq!(system_health_to_f64(Some("Not OK")), 2.0); - assert_eq!(system_health_to_f64(None), 0.0); - assert_eq!(system_health_to_f64(Some("unknown_value")), 0.0); + assert_eq!(system_health_to_state(Some("OK")), "ok"); + assert_eq!(system_health_to_state(Some("Not OK")), "not_ok"); + assert_eq!(system_health_to_state(None), "unknown"); + 
assert_eq!(system_health_to_state(Some("unknown_value")), "unknown"); } #[test] fn test_partition_health_mapping() { - assert_eq!(partition_health_to_f64(Some("unknown")), 0.0); - assert_eq!(partition_health_to_f64(Some("healthy")), 1.0); - assert_eq!(partition_health_to_f64(Some("degraded_bandwidth")), 2.0); - assert_eq!(partition_health_to_f64(Some("degraded")), 3.0); - assert_eq!(partition_health_to_f64(Some("unhealthy")), 4.0); - assert_eq!(partition_health_to_f64(None), 0.0); + assert_eq!(partition_health_to_state(Some("unknown")), "unknown"); + assert_eq!(partition_health_to_state(Some("healthy")), "healthy"); + assert_eq!( + partition_health_to_state(Some("degraded_bandwidth")), + "degraded_bandwidth" + ); + assert_eq!(partition_health_to_state(Some("degraded")), "degraded"); + assert_eq!(partition_health_to_state(Some("unhealthy")), "unhealthy"); + assert_eq!(partition_health_to_state(None), "unknown"); } #[test] fn test_app_status_mapping() { - assert_eq!(app_status_to_f64(Some("ok")), 1.0); - assert_eq!(app_status_to_f64(Some("not ok")), 2.0); - assert_eq!(app_status_to_f64(None), 0.0); - assert_eq!(app_status_to_f64(Some("other")), 0.0); + assert_eq!(app_status_to_state(Some("ok")), "ok"); + assert_eq!(app_status_to_state(Some("not ok")), "not_ok"); + assert_eq!(app_status_to_state(None), "unknown"); + assert_eq!(app_status_to_state(Some("other")), "unknown"); } #[test] @@ -391,11 +631,456 @@ mod tests { assert_eq!(diagnostic_opcode_to_f64("57"), 1.0); } + #[test] + fn test_fan_max_speed_parsing() { + assert_eq!(fan_max_speed_to_f64(Some("33000")), Some(33000.0)); + assert_eq!(fan_max_speed_to_f64(Some(" 33000 ")), Some(33000.0)); + assert_eq!(fan_max_speed_to_f64(Some("6000")), Some(6000.0)); + assert_eq!(fan_max_speed_to_f64(Some("not-a-number")), None); + assert_eq!(fan_max_speed_to_f64(Some("")), None); + assert_eq!(fan_max_speed_to_f64(None), None); + } + + #[test] + fn test_temp_to_f64_parsing() { + assert_eq!(temp_to_f64(Some("105.00")), 
Some(105.0)); + assert_eq!(temp_to_f64(Some(" 43 ")), Some(43.0)); + assert_eq!(temp_to_f64(Some("120.00")), Some(120.0)); + assert_eq!(temp_to_f64(Some("x")), None); + assert_eq!(temp_to_f64(Some("")), None); + assert_eq!(temp_to_f64(None), None); + } + + #[test] + fn test_temp_state_to_state_mapping() { + assert_eq!(temp_state_to_state(Some("ok")), Some("ok")); + assert_eq!(temp_state_to_state(Some("OK")), Some("ok")); + assert_eq!(temp_state_to_state(Some(" ok ")), Some("ok")); + assert_eq!(temp_state_to_state(Some("warning")), Some("not_ok")); + assert_eq!(temp_state_to_state(Some("")), Some("not_ok")); + // absent => None (emit nothing, never fabricate) + assert_eq!(temp_state_to_state(None), None); + } + + #[test] + fn test_fan_led_to_state_mapping() { + // green/ok (case-insensitive) => "ok" + assert_eq!(fan_led_to_state(Some("green")), Some("ok")); + assert_eq!(fan_led_to_state(Some("GREEN")), Some("ok")); + assert_eq!(fan_led_to_state(Some(" green ")), Some("ok")); + assert_eq!(fan_led_to_state(Some("ok")), Some("ok")); + assert_eq!(fan_led_to_state(Some("OK")), Some("ok")); + // any other non-empty value => "not_ok" + assert_eq!(fan_led_to_state(Some("amber")), Some("not_ok")); + assert_eq!(fan_led_to_state(Some("red")), Some("not_ok")); + // absent/empty => None (emit nothing) + assert_eq!(fan_led_to_state(Some("")), None); + assert_eq!(fan_led_to_state(Some(" ")), None); + assert_eq!(fan_led_to_state(None), None); + } + + /// Drives run_iteration's fan parse + emit logic against a captured sink, + /// asserting max-speed sample shape. Table-driven. 
+ #[test] + fn test_fan_max_speed_emit() { + use crate::collectors::nvue::rest::client::FanEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + struct Case { + name: &'static str, + json: &'static str, + // (fan_name, expected_value) pairs that MUST be emitted. + expected: &'static [(&'static str, f64)], + // Fan names that MUST NOT produce a sample. + absent: &'static [&'static str], + } + + let cases = [ + Case { + name: "two healthy fans emit max-speed", + json: r#"{ + "FAN1/1": {"current-speed": "10096", "direction": "F2B", "max-speed": "33000", "min-speed": "6000", "state": "ok"}, + "FAN1/2": {"current-speed": "9800", "direction": "F2B", "max-speed": "33000", "min-speed": "6000", "state": "ok"} + }"#, + expected: &[("FAN1/1", 33000.0), ("FAN1/2", 33000.0)], + absent: &[], + }, + Case { + name: "missing max-speed emits nothing", + json: r#"{ + "FAN1/1": {"current-speed": "10096", "min-speed": "6000", "state": "ok"} + }"#, + expected: &[], + absent: &["FAN1/1"], + }, + Case { + name: "garbage max-speed emits nothing", + json: r#"{ + "FAN1/1": {"max-speed": "bogus", "state": "ok"} + }"#, + expected: &[], + absent: &["FAN1/1"], + }, + ]; + + for case in cases { + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let fans: FanEnvironmentResponse = + serde_json::from_str(case.json).expect("fan json parses"); + // Mirror run_iteration's emit loop exactly. 
+ for (fan_name, fan) in &fans { + if let Some(value) = fan_max_speed_to_f64(fan.max_speed.as_deref()) { + collector.emit_metric( + "fan_max_speed", + Some(fan_name), + value, + "rpm", + vec![(Cow::Borrowed("fan_name"), fan_name.clone())], + ); + } + } + + let samples = sink.samples.lock().unwrap(); + assert_eq!( + samples.len(), + case.expected.len(), + "case '{}': unexpected emitted sample count", + case.name + ); + + for (fan_name, expected_value) in case.expected { + let sample = samples + .iter() + .find(|s| { + s.labels + .iter() + .any(|(k, v)| k == "fan_name" && v == fan_name) + }) + .unwrap_or_else(|| { + panic!("case '{}': no sample for fan {fan_name}", case.name) + }); + + assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); + assert_eq!(sample.metric_type, "fan_max_speed", "case '{}'", case.name); + assert_eq!(sample.unit, "rpm", "case '{}'", case.name); + assert_eq!(sample.value, *expected_value, "case '{}'", case.name); + assert_eq!( + sample.key, + format!("fan_max_speed:{fan_name}"), + "case '{}'", + case.name + ); + assert_eq!(sample.labels.len(), 1, "case '{}'", case.name); + assert_eq!(sample.labels[0].0, "fan_name", "case '{}'", case.name); + assert_eq!(sample.labels[0].1, *fan_name, "case '{}'", case.name); + } + + for fan_name in case.absent { + assert!( + !samples.iter().any(|s| s + .labels + .iter() + .any(|(k, v)| k == "fan_name" && v == fan_name)), + "case '{}': fan {fan_name} should not emit a sample", + case.name + ); + } + } + } + + /// Drives run_iteration's temperature parse + emit logic against a captured + /// sink. A full sensor (ASIC1) emits all four series. A sparse sensor + /// (current + state only) emits two and must NOT fabricate absent max/crit. 
+ #[test] + fn test_platform_temperature_emit() { + use crate::collectors::nvue::rest::client::TemperatureEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let json = r#"{ + "ASIC1": {"crit": "120.00", "current": "43.00", "max": "105.00", "state": "ok"}, + "Ambient-MNG-Temp": {"current": "27.00", "state": "ok"} + }"#; + + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let temps: TemperatureEnvironmentResponse = + serde_json::from_str(json).expect("temperature json parses"); + // Mirror run_iteration's emit loop exactly. 
+ for (sensor_name, temp) in &temps { + let sensor_label = || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; + if let Some(value) = temp_to_f64(temp.current.as_deref()) { + collector.emit_metric( + "platform_temperature", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(value) = temp_to_f64(temp.max.as_deref()) { + collector.emit_metric( + "platform_temperature_max", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(value) = temp_to_f64(temp.crit.as_deref()) { + collector.emit_metric( + "platform_temperature_critical", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(current) = temp_state_to_state(temp.state.as_deref()) { + collector.emit_state_set( + "platform_temperature_state", + Some(sensor_name), + current, + TEMP_STATE_STATES, + sensor_label(), + ); + } + } + + let samples = sink.samples.lock().unwrap(); + // ASIC1: current + max + crit (3) + state StateSet (2) = 5. + // Ambient-MNG-Temp: current (1) + state StateSet (2) = 3. Total 8. + assert_eq!(samples.len(), 8, "unexpected emitted sample count"); + + // Helper: find a sample by metric_type + sensor label. + let find = |metric_type: &str, sensor: &str| { + samples.iter().find(|s| { + s.metric_type == metric_type + && s.labels.iter().any(|(k, v)| k == "sensor" && v == sensor) + }) + }; + + // ASIC1: the three scalar temperature series present with correct + // name/unit/value/label/key. 
+ let expected_asic1: &[(&str, &str, f64)] = &[ + ("platform_temperature", "celsius", 43.0), + ("platform_temperature_max", "celsius", 105.0), + ("platform_temperature_critical", "celsius", 120.0), + ]; + for (metric_type, unit, value) in expected_asic1 { + let sample = find(metric_type, "ASIC1") + .unwrap_or_else(|| panic!("no ASIC1 sample for {metric_type}")); + assert_eq!(sample.name, COLLECTOR_NAME); + assert_eq!(&sample.metric_type, metric_type); + assert_eq!(&sample.unit, unit); + assert_eq!(sample.value, *value, "value for {metric_type}"); + assert_eq!(sample.key, format!("{metric_type}:ASIC1")); + assert_eq!(sample.labels.len(), 1); + assert_eq!(sample.labels[0].0, "sensor"); + assert_eq!(sample.labels[0].1, "ASIC1"); + } + + // ASIC1 state="ok" => StateSet: ok=1, not_ok=0. Sensor label preserved. + let asic1_state: Vec = samples + .iter() + .filter(|s| { + s.metric_type == "platform_temperature_state" + && s.labels.iter().any(|(k, v)| k == "sensor" && v == "ASIC1") + }) + .cloned() + .collect(); + assert_state_set( + &asic1_state, + "platform_temperature_state", + Some(("sensor", "ASIC1")), + TEMP_STATE_STATES, + "ok", + ); + + // Ambient-MNG-Temp: only current + state StateSet emitted. + let ambient_current = + find("platform_temperature", "Ambient-MNG-Temp").expect("ambient current sample"); + assert_eq!(ambient_current.value, 27.0); + assert_eq!(ambient_current.unit, "celsius"); + let ambient_state: Vec = samples + .iter() + .filter(|s| { + s.metric_type == "platform_temperature_state" + && s.labels + .iter() + .any(|(k, v)| k == "sensor" && v == "Ambient-MNG-Temp") + }) + .cloned() + .collect(); + assert_state_set( + &ambient_state, + "platform_temperature_state", + Some(("sensor", "Ambient-MNG-Temp")), + TEMP_STATE_STATES, + "ok", + ); + + // A sensor missing max/crit must NOT emit those series. 
+ assert!( + find("platform_temperature_max", "Ambient-MNG-Temp").is_none(), + "ambient sensor without max must not emit platform_temperature_max" + ); + assert!( + find("platform_temperature_critical", "Ambient-MNG-Temp").is_none(), + "ambient sensor without crit must not emit platform_temperature_critical" + ); + } + + /// Drives run_iteration's fan_led parse + emit logic against a captured sink. + /// "green"/"ok" => 1.0, "amber" => 0.0, absent FAN_STATUS emits nothing. + #[test] + fn test_fan_led_emit() { + use crate::collectors::nvue::rest::client::PlatformEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + struct Case { + name: &'static str, + json: &'static str, + // expected current StateSet state, or None when nothing must emit. 
+ expected: Option<&'static str>, + } + + let cases = [ + Case { + name: "green LED => ok", + json: r#"{"FAN_STATUS": {"state": "green", "type": "led"}}"#, + expected: Some("ok"), + }, + Case { + name: "ok LED => ok", + json: r#"{"FAN_STATUS": {"state": "ok", "type": "led"}}"#, + expected: Some("ok"), + }, + Case { + name: "amber LED => not_ok", + json: r#"{"FAN_STATUS": {"state": "amber", "type": "led"}}"#, + expected: Some("not_ok"), + }, + Case { + name: "absent FAN_STATUS emits nothing", + json: r#"{"PSU_STATUS": {"state": "green", "type": "led"}}"#, + expected: None, + }, + ]; + + for case in cases { + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let env: PlatformEnvironmentResponse = + serde_json::from_str(case.json).expect("env json parses"); + // Mirror run_iteration's emit logic exactly. + if let Some(current) = env + .get("FAN_STATUS") + .and_then(|s| fan_led_to_state(s.state.as_deref())) + { + collector.emit_state_set("fan_led", None, current, FAN_LED_STATES, vec![]); + } + + let samples = sink.samples.lock().unwrap(); + match case.expected { + Some(current) => { + // switch-level StateSet: no per-entity label, but a `state` + // label per series. Series keys are unique per state. + assert_state_set(&samples, "fan_led", None, FAN_LED_STATES, current); + for sample in samples.iter() { + assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); + let state = sample + .labels + .iter() + .find(|(k, _)| k == "state") + .map(|(_, v)| v.clone()) + .expect("state label present"); + assert_eq!( + sample.key, + format!("fan_led:{state}"), + "case '{}'", + case.name + ); + // switch-level: the only label is `state`. 
+ assert_eq!( + sample.labels.len(), + 1, + "case '{}': fan_led is switch-level (only the state label)", + case.name + ); + } + } + None => assert_eq!( + samples.len(), + 0, + "case '{}': absent FAN_STATUS must not emit a sample", + case.name + ), + } + } + } + struct ScriptedProvider { calls: AtomicUsize, - // Each call pops the front of this queue; an empty queue yields an - // error. `HealthError` is not `Clone`, so we store and consume by - // value rather than indexing + `.cloned()`. + // Each call pops the front. An empty queue yields an error. HealthError + // isn't Clone, so we consume by value. responses: StdMutex>>, } @@ -442,6 +1127,9 @@ mod tests { cluster_apps_enabled: false, sdn_partitions_enabled: false, interfaces_enabled: false, + platform_environment_fan_enabled: false, + platform_environment_temperature_enabled: false, + platform_environment_status_enabled: false, } } @@ -496,7 +1184,7 @@ mod tests { assert!(collector.client.has_credentials()); assert_eq!( result.fetch_failures, 0, - "all four paths disabled → no HTTP, no failures" + "all paths disabled → no HTTP, no failures" ); // Subsequent iterations reuse the already-installed credentials. collector diff --git a/crates/health/src/collectors/nvue/tls.rs b/crates/health/src/collectors/nvue/tls.rs index a715e644c0..4c43d07138 100644 --- a/crates/health/src/collectors/nvue/tls.rs +++ b/crates/health/src/collectors/nvue/tls.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier}; use rustls::pki_types::{CertificateDer, ServerName, UnixTime}; -use rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme}; +use rustls::{DigitallySignedStruct, SignatureScheme}; // ! dangerous cert verifier that accepts any server certificate without validation. // ! only enable in test environments where you cannot replace NVOS self-signed certificates. 
@@ -63,12 +63,7 @@ impl ServerCertVerifier for AcceptAnyCertVerifier { } } -/// build a rustls ClientConfig that dangerously skips server certificate verification. -pub fn self_signed_tls_config() -> ClientConfig { - ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) - .with_safe_default_protocol_versions() - .expect("default protocol versions are valid") - .dangerous() - .with_custom_certificate_verifier(Arc::new(AcceptAnyCertVerifier)) - .with_no_client_auth() +/// Dangerous rustls verifier that accepts any server certificate without validation +pub fn accept_any_cert_verifier() -> Arc { + Arc::new(AcceptAnyCertVerifier) } diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index d05275d05c..0b4926a5a6 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -27,9 +27,31 @@ use crate::HealthError; use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::metrics::sanitize_unit; +use crate::metrics::{MetricLabel, sanitize_unit}; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample, SensorThresholdContext}; +#[derive(Clone, Copy)] +enum SensorRangeKind { + Max, + Min, +} + +impl SensorRangeKind { + fn metric_suffix(self) -> &'static str { + match self { + Self::Max => "range_max", + Self::Min => "range_min", + } + } + + fn label_value(self) -> &'static str { + match self { + Self::Max => "reading_range_max", + Self::Min => "reading_range_min", + } + } +} + /// Configuration for the sensor collector. 
pub struct SensorCollectorConfig { pub data_sink: Option>, @@ -256,6 +278,8 @@ impl SensorCollector { let metric_type = reading_type.to_snake_case().to_string(); let unit = sanitize_unit(&unit); + let range_max = sensor.reading_range_max.flatten(); + let range_min = sensor.reading_range_min.flatten(); let ( upper_fatal, @@ -299,10 +323,10 @@ impl SensorCollector { MetricSample { key: sensor.odata_id().to_string(), name: "hw_sensor".to_string(), - metric_type, - unit, + metric_type: metric_type.clone(), + unit: unit.clone(), value: reading, - labels: attributes, + labels: attributes.clone(), context: Some(SensorThresholdContext { entity_type: entity.entity_type().to_string(), sensor_id: sensor.base.id.clone(), @@ -312,14 +336,88 @@ impl SensorCollector { lower_critical, upper_caution, lower_caution, - range_max: sensor.reading_range_max.flatten(), - range_min: sensor.reading_range_min.flatten(), + range_max, + range_min, bmc_health, }), } .into(), )); + if self.include_sensor_thresholds { + self.emit_sensor_range_metric( + sensor.odata_id().to_string(), + &metric_type, + &unit, + &attributes, + SensorRangeKind::Max, + range_max, + ); + self.emit_sensor_range_metric( + sensor.odata_id().to_string(), + &metric_type, + &unit, + &attributes, + SensorRangeKind::Min, + range_min, + ); + } + 1 } + + fn emit_sensor_range_metric( + &self, + sensor_key: String, + reading_type: &str, + unit: &str, + attributes: &[MetricLabel], + range_kind: SensorRangeKind, + value: Option, + ) { + let Some(value) = value else { return }; + let metric_suffix = range_kind.metric_suffix(); + let mut labels = attributes.to_vec(); + labels.push(( + Cow::Borrowed("sensor_range"), + range_kind.label_value().to_string(), + )); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("{sensor_key}/{metric_suffix}"), + name: "hw_sensor".to_string(), + metric_type: format!("{reading_type}_{metric_suffix}"), + unit: unit.to_string(), + value, + labels, + context: None, + } + .into(), + 
)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sensor_range_kind_uses_documented_metric_suffixes_and_label_values() { + assert_eq!(SensorRangeKind::Max.metric_suffix(), "range_max"); + assert_eq!(SensorRangeKind::Max.label_value(), "reading_range_max"); + assert_eq!(SensorRangeKind::Min.metric_suffix(), "range_min"); + assert_eq!(SensorRangeKind::Min.label_value(), "reading_range_min"); + } + + #[test] + fn sensor_range_metric_contract_matches_matrix_surface() { + let reading_type = "fan_speed"; + let range_kind = SensorRangeKind::Max; + + assert_eq!( + format!("{reading_type}_{}", range_kind.metric_suffix()), + "fan_speed_range_max" + ); + assert_eq!(range_kind.label_value(), "reading_range_max"); + } } diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 3f1c035562..2363505973 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -958,6 +958,11 @@ pub struct NvueGnmiConfig { #[serde(with = "humantime_serde")] pub request_timeout: Duration, + /// Dangerously disable TLS certificate and hostname verification for NVUE gNMI. + /// + /// Defaults to false so strict TLS verification remains the default. + pub dangerously_skip_tls_verification: bool, + /// Enable gNMI ON_CHANGE subscription for live system-event messages. 
#[serde(alias = "system_events_subscription_enabled", alias = "events_enabled")] pub system_events_enabled: bool, @@ -972,6 +977,7 @@ impl Default for NvueGnmiConfig { gnmi_port: 9339, sample_interval: Duration::from_secs(300), request_timeout: Duration::from_secs(30), + dangerously_skip_tls_verification: false, system_events_enabled: true, paths: NvueGnmiPaths::default(), } @@ -983,6 +989,7 @@ impl Default for NvueGnmiConfig { pub struct NvueGnmiPaths { pub components_enabled: bool, pub interfaces_enabled: bool, + pub platform_general_enabled: bool, } impl Default for NvueGnmiPaths { @@ -990,6 +997,7 @@ impl Default for NvueGnmiPaths { Self { components_enabled: true, interfaces_enabled: true, + platform_general_enabled: true, } } } @@ -1024,6 +1032,10 @@ impl Default for NvueRestConfig { /// - cluster_apps_enabled: Poll `/nvue_v1/cluster/apps`. /// - sdn_partitions_enabled: Poll `/nvue_v1/sdn/partition` (including per-partition details) /// - interfaces_enabled: Poll `/nvue_v1/interface`. +/// - platform_environment_fan_enabled: Poll `/nvue_v1/platform/environment/fan`. +/// - platform_environment_temperature_enabled: Poll `/nvue_v1/platform/environment/temperature`. +/// - platform_environment_status_enabled: Poll `/nvue_v1/platform/environment` parent +/// summary for the aggregate `FAN_STATUS` LED state. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct NvueRestPaths { @@ -1031,6 +1043,9 @@ pub struct NvueRestPaths { pub cluster_apps_enabled: bool, pub sdn_partitions_enabled: bool, pub interfaces_enabled: bool, + pub platform_environment_fan_enabled: bool, + pub platform_environment_temperature_enabled: bool, + pub platform_environment_status_enabled: bool, } impl Default for NvueRestPaths { @@ -1040,6 +1055,9 @@ impl Default for NvueRestPaths { cluster_apps_enabled: true, sdn_partitions_enabled: true, interfaces_enabled: true, + platform_environment_fan_enabled: true, + platform_environment_temperature_enabled: true, + platform_environment_status_enabled: true, } } } @@ -1356,6 +1374,7 @@ mod tests { assert_eq!(gnmi.gnmi_port, 9339); assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); + assert!(!gnmi.dangerously_skip_tls_verification); assert!(gnmi.system_events_enabled); } else { panic!("nvue gnmi config should be enabled in example config"); @@ -1867,6 +1886,60 @@ system_events_enabled = false } } + #[test] + fn test_nvue_gnmi_dangerous_tls_skip_defaults_false_and_parses_true() { + let omitted = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(omitted)) + .extract() + .expect("failed to parse omitted tls flag"); + + let Configurable::Enabled(nvue) = config.collectors.nvue else { + panic!("nvue config should be enabled"); + }; + let Configurable::Enabled(gnmi) = nvue.gnmi else { + panic!("gnmi config should be enabled"); + }; + assert!(!gnmi.dangerously_skip_tls_verification); + + let enabled = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +dangerously_skip_tls_verification = 
true +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(enabled)) + .extract() + .expect("failed to parse enabled tls flag"); + + let Configurable::Enabled(nvue) = config.collectors.nvue else { + panic!("nvue config should be enabled"); + }; + let Configurable::Enabled(gnmi) = nvue.gnmi else { + panic!("gnmi config should be enabled"); + }; + assert!(gnmi.dangerously_skip_tls_verification); + } + #[test] fn test_static_endpoint_with_switch_serial() { let toml_content = r#" diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fb71066fb9..93ab6bf951 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -92,7 +92,7 @@ fn resource_attributes(context: &EventContext) -> Vec { attrs.push(kv("switch.id", switch_id.to_string())); } if let Some(serial) = context.switch_serial() { - attrs.push(kv("switch.serial", serial.to_string())); + attrs.push(kv("switch.serial_number", serial.to_string())); } if let Some(role) = context.switch_endpoint_role() { let endpoint_role = match role { @@ -237,6 +237,7 @@ pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportL /// be added when the health metric model exposes those temporality choices. pub fn build_metrics_export_request( batch: &[(EventContext, MetricSample)], + metric_name_prefix: &str, ) -> ExportMetricsServiceRequest { let observed_nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) @@ -246,19 +247,30 @@ pub fn build_metrics_export_request( let mut by_endpoint: HashMap, Vec)> = HashMap::new(); for (context, sample) in batch { + // Switch identity rides once on the resource attributes (switch.id, + // switch.serial_number, switch.ip). VictoriaMetrics flattens resource + // attributes onto every series, so promoting them onto the datapoint too + // only duplicates the same value under a second (underscore) label name. 
+ let attributes: Vec = sample + .labels + .iter() + .map(|(k, v)| kv(k, v.clone())) + .collect(); + let data_point = NumberDataPoint { - attributes: sample - .labels - .iter() - .map(|(k, v)| kv(k, v.clone())) - .collect(), + attributes, time_unix_nano: observed_nanos, value: Some(number_data_point::Value::AsDouble(sample.value)), ..Default::default() }; let otlp_metric = OtlpMetric { - name: sample.metric_type.clone(), + // match the Prometheus sink's full series name exactly so Grafana queries + // resolve identically across both export paths. + name: format!( + "{}_{}_{}_{}", + metric_name_prefix, sample.name, sample.metric_type, sample.unit + ), description: String::new(), unit: sample.unit.clone(), data: Some(metric::Data::Gauge(OtlpGauge { @@ -470,7 +482,10 @@ mod tests { attr_value(&attrs, "switch.id"), Some(switch_id_attr.as_str()) ); - assert_eq!(attr_value(&attrs, "switch.serial"), Some("SN-SWITCH-001")); + assert_eq!( + attr_value(&attrs, "switch.serial_number"), + Some("SN-SWITCH-001") + ); assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("host")); assert_eq!(attr_bool_value(&attrs, "switch.is_primary"), Some(true)); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); @@ -518,7 +533,7 @@ mod tests { Some(switch_id_attr.as_str()) ); assert_eq!( - attr_value(&attrs, "switch.serial"), + attr_value(&attrs, "switch.serial_number"), Some("SN-SWITCH-BMC-001") ); assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("bmc")); @@ -691,10 +706,13 @@ mod tests { context: None, }; - let request = build_metrics_export_request(&[ - (rest_ctx, sample("nvue_rest")), - (gnmi_ctx, sample("nvue_gnmi")), - ]); + let request = build_metrics_export_request( + &[ + (rest_ctx, sample("nvue_rest")), + (gnmi_ctx, sample("nvue_gnmi")), + ], + "carbide_hardware_health", + ); let collector_types: std::collections::HashSet<_> = request .resource_metrics @@ -709,7 +727,7 @@ mod tests { } #[test] - fn metric_export_name_uses_metric_type() { + fn 
metric_export_name_uses_full_prometheus_series_name() { let ctx = test_context(); let sample = MetricSample { key: "asic0/oper_status".to_string(), @@ -721,11 +739,81 @@ mod tests { context: None, }; - let request = build_metrics_export_request(&[(ctx, sample)]); + let request = build_metrics_export_request(&[(ctx, sample)], "carbide_hardware_health"); let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; assert_eq!(metrics.len(), 1); - assert_eq!(metrics[0].name, "interface_oper_status"); + assert_eq!( + metrics[0].name, + "carbide_hardware_health_nvue_gnmi_interface_oper_status_state" + ); assert_eq!(metrics[0].unit, "state"); } + + #[test] + fn switch_nmxt_identity_is_resource_only_not_on_datapoint() { + let switch_id = test_switch_id("switch-nmxt"); + let switch_id_attr = switch_id.to_string(); + let context = EventContext { + endpoint_key: "11:22:33:44:55:66".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 1, 1)), + port: Some(443), + mac: MacAddress::from_str("11:22:33:44:55:66").expect("valid mac"), + }, + collector_type: "nvue_gnmi", + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: true, + nmxt_enabled: true, + })), + rack_id: Some(RackId::new("RACK_2")), + }; + let sample = MetricSample { + key: "effective_ber".to_string(), + name: "switch_nmxt".to_string(), + metric_type: "effective_ber".to_string(), + unit: "ratio".to_string(), + value: 0.5, + labels: vec![], + context: None, + }; + + let request = build_metrics_export_request(&[(context, sample)], "carbide_hardware_health"); + let resource_metrics = &request.resource_metrics[0]; + let metrics = &resource_metrics.scope_metrics[0].metrics; + + assert_eq!(metrics.len(), 1); + assert_eq!( + metrics[0].name, + "carbide_hardware_health_switch_nmxt_effective_ber_ratio" + ); + + let 
metric::Data::Gauge(gauge) = metrics[0].data.as_ref().expect("metric data") else { + panic!("expected gauge data"); + }; + // Identity must NOT be promoted onto the datapoint (VM duplicates it from the resource). + let attrs = &gauge.data_points[0].attributes; + assert_eq!(attr_value(attrs, "switch_serial"), None); + assert_eq!(attr_value(attrs, "switch_id"), None); + + // It lives once, on the resource (dotted form). + let resource_attrs = &resource_metrics + .resource + .as_ref() + .expect("resource") + .attributes; + assert_eq!( + attr_value(resource_attrs, "switch.serial_number"), + Some("SN-SWITCH-001") + ); + assert_eq!( + attr_value(resource_attrs, "switch.id"), + Some(switch_id_attr.as_str()) + ); + } } diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs index bb04dac56a..f8e944f4b7 100644 --- a/crates/health/src/otlp/metrics_drain.rs +++ b/crates/health/src/otlp/metrics_drain.rs @@ -31,6 +31,7 @@ pub(crate) struct OtlpMetricsDrainTask { endpoint: String, batch_size: usize, flush_interval: Duration, + metric_name_prefix: String, } impl OtlpMetricsDrainTask { @@ -39,12 +40,14 @@ impl OtlpMetricsDrainTask { endpoint: String, batch_size: usize, flush_interval: Duration, + metric_name_prefix: String, ) -> Self { Self { queue, endpoint, batch_size, flush_interval, + metric_name_prefix, } } @@ -133,7 +136,7 @@ impl OtlpMetricsDrainTask { return; } - let request = build_metrics_export_request(batch); + let request = build_metrics_export_request(batch, &self.metric_name_prefix); batch.clear(); let point_count = request diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index 4f422d9a8b..74cca0f713 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -130,6 +130,7 @@ impl OtlpSink { config.endpoint.clone(), config.batch_size, config.flush_interval, + prefix.to_string(), ); handle.spawn(metrics_drain.run()); diff --git a/crates/health/src/sink/prometheus.rs 
b/crates/health/src/sink/prometheus.rs index 3822b24d31..3e154b546c 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -22,7 +22,18 @@ use dashmap::DashMap; use super::{CollectorEvent, DataSink, EventContext, MetricSample}; use crate::HealthError; -use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricsManager}; +use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricLabel, MetricsManager}; + +/// High-cardinality / free-text labels kept for OTLPSink but excluded from PrometheusSink +const PROMETHEUS_EXCLUDED_LABELS: &[&str] = &["status_message"]; + +fn filter_prometheus_labels(labels: &[MetricLabel]) -> Vec { + labels + .iter() + .filter(|(key, _)| !PROMETHEUS_EXCLUDED_LABELS.contains(&key.as_ref())) + .cloned() + .collect() +} pub struct PrometheusSink { collector_registry: Arc, @@ -204,7 +215,7 @@ impl DataSink for PrometheusSink { sample.unit.clone(), sample.value, ) - .with_labels(sample.labels.clone()), + .with_labels(filter_prometheus_labels(&sample.labels)), ); } Err(error) => { @@ -333,4 +344,27 @@ mod tests { assert_eq!(label_value("switch_slot_number"), Some("7")); assert_eq!(label_value("switch_tray_index"), Some("3")); } + + // status_message is excluded from Prometheus series; other labels (e.g. port_num) are retained. + #[test] + fn test_filter_prometheus_labels_drops_status_message() { + let labels: Vec = vec![ + ( + std::borrow::Cow::Borrowed("status_message"), + "No issue was observed".to_string(), + ), + (std::borrow::Cow::Borrowed("port_num"), "11".to_string()), + ]; + + let filtered = filter_prometheus_labels(&labels); + + assert!( + !filtered.iter().any(|(k, _)| k == "status_message"), + "status_message must be excluded from Prometheus series" + ); + assert!( + filtered.iter().any(|(k, v)| k == "port_num" && v == "11"), + "non-excluded labels must be retained" + ); + } }