From d37dd723fd258dbe3666cabf15cd19952331069c Mon Sep 17 00:00:00 2001 From: Jay Zhu Date: Thu, 25 Jun 2026 08:54:30 -0600 Subject: [PATCH 1/4] feat(health): enrich machine log metadata Propagate machine identity, serial, GPU driver version, NVLink domain, and component type through health endpoint context into log-related sinks. Driver version is derived from NICo machine discovery when all reported GPUs agree, or from static TOML metadata for static targets. Signed-off-by: Jay Zhu --- crates/health/benches/collector_pipeline.rs | 4 +- crates/health/benches/processor_pipeline.rs | 6 +- crates/health/benches/sink_pipeline.rs | 4 +- crates/health/src/api_client.rs | 147 ++++++++++++++++++- crates/health/src/config.rs | 13 +- crates/health/src/discovery/spawn.rs | 5 +- crates/health/src/endpoint/mod.rs | 4 +- crates/health/src/endpoint/model.rs | 50 ++++++- crates/health/src/endpoint/sources.rs | 9 +- crates/health/src/otlp/convert.rs | 56 ++++++- crates/health/src/processor/health_report.rs | 4 +- crates/health/src/sink/events.rs | 78 +++++++--- crates/health/src/sink/health_report.rs | 4 +- crates/health/src/sink/log_file.rs | 80 +++++++++- crates/health/src/sink/mod.rs | 8 +- crates/health/src/sink/prometheus.rs | 6 +- crates/health/src/sink/tracing.rs | 10 ++ 17 files changed, 452 insertions(+), 36 deletions(-) diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 273865b714..307e620382 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -21,7 +21,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ CollectorEvent, CompositeDataSink, DataSink, EventContext, FirmwareInfo, LogRecord, @@ -60,6 +60,8 @@ fn event_context() -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index d005eac8df..3e7c29b3d4 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -20,7 +20,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::processor::{ EventProcessingPipeline, EventProcessor, HealthReportProcessor, LeakEventProcessor, @@ -96,6 +96,8 @@ fn event_context() -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, } @@ -273,6 +275,8 @@ fn rack_event_contexts(rack_id: &str, tray_count: usize) -> Vec { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new(rack_id)), } diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 0e8e9d730f..3ad58c0179 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -21,7 +21,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ Classification, CollectorEvent, CompositeDataSink, DataSink, EventContext, HealthReport, @@ -70,6 +70,8 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 5ffe158532..ddfc3c4844 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -34,8 +34,8 @@ use url::Url; use crate::HealthError; use crate::bmc::{BmcClient, BoxFuture, CredentialProvider}; use crate::endpoint::{ - BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, - PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, ComponentType, EndpointMetadata, EndpointSource, + MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; /// [`ApiEndpointSource`]. @@ -444,6 +444,8 @@ impl ApiEndpointSource { .nvlink_info .as_ref() .and_then(|info| info.domain_uuid), + driver_version: unique_gpu_driver_version(machine.discovery_info.as_ref()), + component_type: machine_component_type(machine), }) }); @@ -590,6 +592,43 @@ fn cache_or_create_bmc_client( Ok(client) } +/// Returns the machine-level GPU driver version derived from discovery data. +/// +/// The NICo API reports driver versions per GPU. Health emits one machine-level +/// value only when there is exactly one unique non-empty version across the +/// reported GPUs. Empty strings are treated as missing data; conflicting +/// non-empty versions are treated as ambiguous and omitted. +fn unique_gpu_driver_version( + discovery_info: Option<&rpc::machine_discovery::DiscoveryInfo>, +) -> Option { + let discovery_info = discovery_info?; + let versions = discovery_info + .gpus + .iter() + .map(|gpu| gpu.driver_version.trim()) + .filter(|version| !version.is_empty()) + .map(str::to_string) + .collect::>(); + + (versions.len() == 1) + .then(|| versions.into_iter().next()) + .flatten() +} + +/// Classifies a Forge machine as the component category emitted with health telemetry. +fn machine_component_type(machine: &rpc::forge::Machine) -> ComponentType { + match rpc::forge::MachineType::try_from(machine.machine_type) { + Ok(rpc::forge::MachineType::Dpu) => ComponentType::Dpu, + Ok(rpc::forge::MachineType::Host) => ComponentType::ComputeNode, + Ok(rpc::forge::MachineType::PowerShelf | rpc::forge::MachineType::Unknown) | Err(_) => { + machine + .id + .map(|machine_id| ComponentType::from_machine_type(machine_id.machine_type())) + .unwrap_or(ComponentType::ComputeNode) + } + } +} + impl EndpointSource for ApiEndpointSource { fn fetch_bmc_hosts<'a>(&'a self) -> BoxFuture<'a, Result>, HealthError>> { Box::pin(self.fetch_bmc_hosts()) @@ -672,6 +711,8 @@ impl From for BmcCredentials { mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; + use carbide_test_support::value_scenarios; + use carbide_uuid::machine::MachineId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; @@ -712,6 +753,108 @@ mod tests { )?)) } + /// Builds discovery metadata with one GPU entry per supplied driver version. + fn discovery_with_driver_versions( + driver_versions: &[&str], + ) -> rpc::machine_discovery::DiscoveryInfo { + rpc::machine_discovery::DiscoveryInfo { + gpus: driver_versions + .iter() + .map(|driver_version| rpc::machine_discovery::Gpu { + driver_version: (*driver_version).to_string(), + ..Default::default() + }) + .collect(), + ..Default::default() + } + } + + /// Returns a known DPU ID for fallback classification tests. + fn dpu_machine_id() -> MachineId { + "fm100ds3gfip02lfgleidqoitqgh8d8mdc4a3j2tdncbjrfjtvrrhn2kleg" + .parse() + .expect("valid DPU machine id") + } + + /// Classifies a minimal Forge machine with the supplied type and optional ID. + fn component_type_for_machine_type( + (machine_type, machine_id): (rpc::forge::MachineType, Option), + ) -> ComponentType { + let machine = rpc::forge::Machine { + machine_type: machine_type as i32, + id: machine_id, + ..Default::default() + }; + + machine_component_type(&machine) + } + + /// Verifies that driver-version extraction emits only a unique non-empty value. + #[test] + fn unique_gpu_driver_version_uses_single_non_empty_version() { + value_scenarios!( + run = |discovery_info: Option| { + unique_gpu_driver_version(discovery_info.as_ref()) + }; + "missing discovery info" { + None => None, + } + + "no gpus" { + Some(discovery_with_driver_versions(&[])) => None, + } + + "empty gpu driver versions" { + Some(discovery_with_driver_versions(&["", " "])) => None, + } + + "one gpu driver version" { + Some(discovery_with_driver_versions(&["570.82"])) => Some("570.82".to_string()), + } + + "same gpu driver version repeated" { + Some(discovery_with_driver_versions(&["570.82", " 570.82 "])) => { + Some("570.82".to_string()) + }, + } + + "mixed gpu driver versions" { + Some(discovery_with_driver_versions(&["570.82", "580.12"])) => None, + } + ); + } + + /// Verifies that Forge host and DPU machine types map to telemetry categories. + #[test] + fn machine_component_type_uses_api_machine_type() { + value_scenarios!( + run = component_type_for_machine_type; + "host" { + (rpc::forge::MachineType::Host, None) => ComponentType::ComputeNode, + } + + "dpu" { + (rpc::forge::MachineType::Dpu, None) => ComponentType::Dpu, + } + + "unknown with dpu machine id" { + (rpc::forge::MachineType::Unknown, Some(dpu_machine_id())) => ComponentType::Dpu, + } + + "unknown without machine id" { + (rpc::forge::MachineType::Unknown, None) => ComponentType::ComputeNode, + } + + "power shelf with dpu machine id" { + (rpc::forge::MachineType::PowerShelf, Some(dpu_machine_id())) => ComponentType::Dpu, + } + + "power shelf without machine id" { + (rpc::forge::MachineType::PowerShelf, None) => ComponentType::ComputeNode, + } + ); + } + #[test] fn cache_returns_existing_client_on_matching_kind() { let mut cache: HashMap = HashMap::new(); diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 3f1c035562..d33cc0a9b4 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -114,12 +114,22 @@ pub struct StaticBmcEndpoint { #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticMachineEndpoint { + /// Stable NICo machine ID for this BMC endpoint. pub id: String, + + /// Optional chassis serial to emit as machine telemetry metadata. pub serial: Option, + + /// Optional uniform GPU driver version to emit for local/static validation. + pub driver_version: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, + + /// Optional NVLink domain UUID associated with this machine. pub nvlink_domain_uuid: Option, } @@ -2051,7 +2061,7 @@ ip = "10.0.1.2" mac = "11:22:33:44:55:11" username = "admin" password = "pass" -machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", driver_version = "570.82", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } "#; let config: Config = Figment::new() @@ -2067,6 +2077,7 @@ machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(machine.slot_number, Some(15)); assert_eq!(machine.tray_index, Some(5)); + assert_eq!(machine.driver_version.as_deref(), Some("570.82")); assert_eq!( machine.nvlink_domain_uuid.as_deref(), Some("00000000-0000-0000-0000-000000000000") diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d0c045dba1..f048251083 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -564,7 +564,8 @@ mod tests { }; use crate::endpoint::test_support::endpoint_with_creds; use crate::endpoint::{ - BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, ComponentType, EndpointMetadata, MachineData, SwitchData, + SwitchEndpointRole, }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; @@ -638,6 +639,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, }) } diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index e5e4a9e33b..5e67ff5648 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -19,8 +19,8 @@ mod model; mod sources; pub use model::{ - BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, - PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, ComponentType, EndpointMetadata, EndpointSource, + MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; pub use sources::{CompositeEndpointSource, StaticEndpointSource}; diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 80d318075d..da499c45a9 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -19,7 +19,7 @@ use std::borrow::Cow; use std::net::IpAddr; use std::sync::Arc; -use carbide_uuid::machine::MachineId; +use carbide_uuid::machine::{MachineId, MachineType}; use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; @@ -94,13 +94,61 @@ impl EndpointMetadata { } } +/// Hardware component category attached to machine health telemetry. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ComponentType { + /// Forge-managed host compute node. + ComputeNode, + + /// Data processing unit machine. + Dpu, +} + +impl ComponentType { + /// Returns the stable telemetry spelling for this component category. + pub const fn as_str(self) -> &'static str { + match self { + Self::ComputeNode => "compute_node", + Self::Dpu => "dpu", + } + } + + /// Maps a NICo machine type to its health telemetry component category. + pub const fn from_machine_type(machine_type: MachineType) -> Self { + match machine_type { + MachineType::Dpu => Self::Dpu, + MachineType::Host | MachineType::PredictedHost => Self::ComputeNode, + } + } +} + +/// Metadata that describes a machine endpoint for health telemetry. #[derive(Clone, Debug)] pub struct MachineData { + /// Stable NICo machine identifier. pub machine_id: MachineId, + + /// Hardware chassis serial discovered from machine DMI data, when known. pub machine_serial: Option, + + /// Physical rack slot where the machine is installed, when known. pub slot_number: Option, + + /// Compute tray index where the machine is installed, when known. pub tray_index: Option, + + /// NVLink domain UUID for the machine, when it participates in an NVLink domain. pub nvlink_domain_uuid: Option, + + /// Machine-level GPU driver version. + /// + /// This is populated only when API discovery reports exactly one unique + /// non-empty GPU driver version for the machine. It stays absent when the + /// version is unknown or the discovered GPUs report conflicting versions. + pub driver_version: Option, + + /// Component category derived from the machine type. + pub component_type: ComponentType, } #[derive(Clone, Debug)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index cad243113d..dd774c5f23 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -28,8 +28,8 @@ use crate::HealthError; use crate::bmc::{BmcClient, FixedCredentialProvider}; use crate::config::{StaticBmcEndpoint, StaticSwitchEndpointRole}; use crate::endpoint::{ - BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, - PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, ComponentType, EndpointMetadata, + EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; pub struct StaticEndpointSource { @@ -135,6 +135,8 @@ impl StaticEndpointSource { slot_number: machine.slot_number, tray_index: machine.tray_index, nvlink_domain_uuid, + driver_version: machine.driver_version.clone(), + component_type: ComponentType::from_machine_type(machine_id.machine_type()), })), Err(error) => { tracing::warn!( @@ -392,6 +394,7 @@ mod tests { slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), + driver_version: Some("570.82".to_string()), }), power_shelf: None, switch: None, @@ -416,6 +419,8 @@ mod tests { assert_eq!(machine.slot_number, Some(15)); assert_eq!(machine.tray_index, Some(5)); assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); + assert_eq!(machine.driver_version.as_deref(), Some("570.82")); + assert_eq!(machine.component_type, ComponentType::ComputeNode); } other => panic!("expected Machine metadata, got {other:?}"), } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fb71066fb9..c68aec7633 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -88,6 +88,15 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(machine_id) = context.machine_id() { attrs.push(kv("machine.id", machine_id.to_string())); } + if let Some(machine_serial) = context.machine_serial() { + attrs.push(kv("machine.serial", machine_serial.to_string())); + } + if let Some(driver_version) = context.driver_version() { + attrs.push(kv("driver.version", driver_version.to_string())); + } + if let Some(component_type) = context.component_type() { + attrs.push(kv("component.type", component_type.as_str().to_string())); + } if let Some(switch_id) = context.switch_id() { attrs.push(kv("switch.id", switch_id.to_string())); } @@ -305,7 +314,9 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; + use crate::endpoint::{ + BmcAddr, ComponentType, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + }; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -379,10 +390,12 @@ mod tests { machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" .parse() .expect("valid machine id"), - machine_serial: None, + machine_serial: Some("MN-001".to_string()), slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some(domain_uuid), + driver_version: Some("570.82".to_string()), + component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new("RACK_1")), }; @@ -390,6 +403,9 @@ mod tests { let attrs = resource_attributes(&context); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_1")); + assert_eq!(attr_value(&attrs, "machine.serial"), Some("MN-001")); + assert_eq!(attr_value(&attrs, "driver.version"), Some("570.82")); + assert_eq!(attr_value(&attrs, "component.type"), Some("compute_node")); assert_eq!(attr_int_value(&attrs, "machine.slot_number"), Some(15)); assert_eq!(attr_int_value(&attrs, "machine.tray_index"), Some(5)); assert_eq!( @@ -398,6 +414,42 @@ mod tests { ); } + /// Verifies that absent optional machine metadata does not emit empty resource attributes. + #[test] + fn resource_attributes_omit_absent_optional_machine_metadata() { + let context = EventContext { + endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + port: Some(443), + mac: MacAddress::from_str("42:9e:b1:bd:9d:dd").expect("valid mac"), + }, + collector_type: "test", + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, + })), + rack_id: None, + }; + + let attrs = resource_attributes(&context); + + assert_eq!( + attr_value(&attrs, "machine.id"), + Some("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") + ); + assert_eq!(attr_value(&attrs, "machine.serial"), None); + assert_eq!(attr_value(&attrs, "driver.version"), None); + assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); + } + #[test] fn resource_attributes_include_switch_placement_metadata_when_present() { let switch_id = test_switch_id("switch-a"); diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 4ccfc23089..dcc68154bf 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -261,7 +261,7 @@ mod tests { use nv_redfish::resource::Health as BmcHealth; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use crate::sink::HealthReportTarget; fn test_context() -> EventContext { @@ -281,6 +281,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 7b9a507c05..4fe6e51d43 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -30,7 +30,9 @@ use health_report::{ use nv_redfish::resource::Health as BmcHealth; use serde::Serialize; -use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; +use crate::endpoint::{ + BmcAddr, BmcEndpoint, ComponentType, EndpointMetadata, MachineData, SwitchEndpointRole, +}; use crate::metrics::MetricLabel; #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -65,32 +67,54 @@ impl EventContext { &self.endpoint_key } + /// Returns machine metadata when this context belongs to a machine endpoint. + fn machine_metadata(&self) -> Option<&MachineData> { + let Some(EndpointMetadata::Machine(machine)) = self.metadata.as_ref() else { + return None; + }; + + Some(machine) + } + + /// Returns the stable NICo machine ID when the endpoint is a machine. pub fn machine_id(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => Some(machine.machine_id), - _ => None, - } + self.machine_metadata().map(|machine| machine.machine_id) + } + + /// Returns the machine chassis serial when the endpoint is a machine. + pub fn machine_serial(&self) -> Option<&str> { + self.machine_metadata() + .and_then(|machine| machine.machine_serial.as_deref()) + } + + /// Returns the uniform GPU driver version when it is known for the machine. + pub fn driver_version(&self) -> Option<&str> { + self.machine_metadata() + .and_then(|machine| machine.driver_version.as_deref()) } + /// Returns the machine component category when the endpoint is a machine. + pub fn component_type(&self) -> Option { + self.machine_metadata() + .map(|machine| machine.component_type) + } + + /// Returns the physical rack slot number when the endpoint is a machine. pub fn slot_number(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.slot_number, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.slot_number) } + /// Returns the compute tray index when the endpoint is a machine. pub fn tray_index(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.tray_index, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.tray_index) } + /// Returns the NVLink domain UUID when the machine participates in one. pub fn nvlink_domain_uuid(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.nvlink_domain_uuid, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.nvlink_domain_uuid) } pub fn switch_id(&self) -> Option { @@ -518,6 +542,9 @@ mod tests { slot_number: Option, tray_index: Option, nvlink_domain_uuid: Option, + machine_serial: Option, + driver_version: Option, + component_type: Option, switch_id: Option, switch_serial: Option, switch_endpoint_role: Option, @@ -611,6 +638,8 @@ mod tests { slot_number: Some(7), tray_index: Some(3), nvlink_domain_uuid: Some(nvlink_domain_id()), + driver_version: Some("570.82".to_string()), + component_type: ComponentType::ComputeNode, })), ContextKind::Switch => Some(EndpointMetadata::Switch(SwitchData { id: Some(switch_id()), @@ -643,6 +672,9 @@ mod tests { slot_number: context.slot_number(), tray_index: context.tray_index(), nvlink_domain_uuid: context.nvlink_domain_uuid().map(|id| id.to_string()), + machine_serial: context.machine_serial().map(str::to_string), + driver_version: context.driver_version().map(str::to_string), + component_type: context.component_type(), switch_id: context.switch_id().map(|id| id.to_string()), switch_serial: context.switch_serial().map(str::to_string), switch_endpoint_role: context.switch_endpoint_role(), @@ -1021,6 +1053,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: None, switch_id: None, switch_serial: None, switch_endpoint_role: None, @@ -1041,6 +1076,9 @@ mod tests { slot_number: Some(7), tray_index: Some(3), nvlink_domain_uuid: Some(nvlink_domain_id().to_string()), + machine_serial: Some("MN-001".to_string()), + driver_version: Some("570.82".to_string()), + component_type: Some(ComponentType::ComputeNode), switch_id: None, switch_serial: None, switch_endpoint_role: None, @@ -1061,6 +1099,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: None, switch_id: Some(switch_id().to_string()), switch_serial: Some("SW-001".to_string()), switch_endpoint_role: Some(SwitchEndpointRole::Host), @@ -1081,6 +1122,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: None, switch_id: None, switch_serial: None, switch_endpoint_role: None, diff --git a/crates/health/src/sink/health_report.rs b/crates/health/src/sink/health_report.rs index 73c5558f1c..7a15e256ba 100644 --- a/crates/health/src/sink/health_report.rs +++ b/crates/health/src/sink/health_report.rs @@ -236,7 +236,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use crate::sink::events::{Classification, HealthReportAlert, HealthReportSuccess, Probe}; fn machine_id(value: &str) -> MachineId { @@ -272,6 +272,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/src/sink/log_file.rs b/crates/health/src/sink/log_file.rs index 5d6f44ada9..859fe8b093 100644 --- a/crates/health/src/sink/log_file.rs +++ b/crates/health/src/sink/log_file.rs @@ -81,10 +81,21 @@ impl DataSink for LogFileSink { } } +/// JSONL representation of a log event written by the file sink. #[derive(Serialize)] struct JsonLogRecord<'a> { endpoint: &'a str, collector: &'a str, + #[serde(skip_serializing_if = "Option::is_none")] + machine_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + machine_serial: Option<&'a str>, + #[serde(skip_serializing_if = "Option::is_none")] + driver_version: Option<&'a str>, + #[serde(skip_serializing_if = "Option::is_none")] + component_type: Option<&'static str>, + #[serde(skip_serializing_if = "Option::is_none")] + nvlink_domain_uuid: Option, severity: &'a str, body: &'a str, #[serde(skip_serializing_if = "Vec::is_empty")] @@ -97,6 +108,11 @@ impl<'a> JsonLogRecord<'a> { Self { endpoint: context.endpoint_key(), collector: context.collector_type, + machine_id: context.machine_id().map(|id| id.to_string()), + machine_serial: context.machine_serial(), + driver_version: context.driver_version(), + component_type: context.component_type().map(|component| component.as_str()), + nvlink_domain_uuid: context.nvlink_domain_uuid().map(|id| id.to_string()), severity: &record.severity, body: &record.body, attributes: record @@ -225,12 +241,14 @@ mod tests { use std::borrow::Cow; use std::str::FromStr; + use carbide_uuid::nvlink::NvLinkDomainId; use mac_address::MacAddress; use super::*; - use crate::endpoint::BmcAddr; + use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use crate::sink::DiagnosticLogRecord; + /// Builds a base log context without endpoint metadata. fn test_context() -> EventContext { EventContext { endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), @@ -245,6 +263,24 @@ mod tests { } } + /// Builds a log context with representative machine metadata. + fn machine_context() -> EventContext { + EventContext { + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: Some("MN-001".to_string()), + slot_number: None, + tray_index: None, + nvlink_domain_uuid: Some(NvLinkDomainId::nil()), + driver_version: Some("570.82".to_string()), + component_type: ComponentType::ComputeNode, + })), + ..test_context() + } + } + #[test] fn test_ignores_non_log_events() { let dir = tempfile::tempdir().expect("tempdir"); @@ -385,6 +421,48 @@ mod tests { assert_eq!(parent["body"], "parent log"); } + /// Verifies that machine metadata is emitted as top-level JSONL fields. + #[test] + fn test_writes_machine_metadata_as_jsonl_fields() { + let dir = tempfile::tempdir().expect("tempdir"); + let config = LogFileSinkConfig { + include_diagnostics: false, + output_dir: dir.path().to_string_lossy().into_owned(), + max_file_size: 1024 * 1024, + max_backups: 2, + }; + let sink = LogFileSink::new(&config).expect("sink"); + let ctx = machine_context(); + + let event = CollectorEvent::Log( + LogRecord { + body: "xid event".to_string(), + severity: "WARN".to_string(), + attributes: Vec::new(), + diagnostic_record: None, + } + .into(), + ); + sink.handle_event(&ctx, &event); + + let log_path = dir.path().join("health_logs.jsonl"); + let contents = fs::read_to_string(log_path).expect("read log"); + let line = contents.lines().next().expect("one JSONL record"); + + let parsed: serde_json::Value = serde_json::from_str(line).expect("valid json"); + assert_eq!( + parsed["machine_id"], + "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + ); + assert_eq!(parsed["machine_serial"], "MN-001"); + assert_eq!(parsed["driver_version"], "570.82"); + assert_eq!(parsed["component_type"], "compute_node"); + assert_eq!( + parsed["nvlink_domain_uuid"], + "00000000-0000-0000-0000-000000000000" + ); + } + #[test] fn test_rotation_creates_backups() { let dir = tempfile::tempdir().expect("tempdir"); diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index 9df648faf8..6f3ee4f3e3 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -68,7 +68,7 @@ mod tests { CollectorEvent, CompositeDataSink, DataSink, DiagnosticLogRecord, EventContext, LogRecord, MetricSample, PrometheusSink, }; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; use crate::metrics::MetricsManager; struct CountingSink { @@ -164,6 +164,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, }; @@ -236,6 +238,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, }; @@ -291,6 +295,8 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: None, }; diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 3822b24d31..8819ed6e43 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -244,7 +244,9 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; + use crate::endpoint::{ + BmcAddr, ComponentType, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + }; fn test_switch_id(label: &str) -> SwitchId { let mut hash = [0u8; 32]; @@ -271,6 +273,8 @@ mod tests { slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some(NvLinkDomainId::nil()), + driver_version: None, + component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new("RACK_1")), }; diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index a8ba685d2d..e250095539 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -82,6 +82,11 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, + machine_id = context.machine_id().map(tracing::field::display), + machine_serial = context.machine_serial(), + driver_version = context.driver_version(), + component_type = context.component_type().map(|component| component.as_str()), + nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, attributes = ?record.attributes, @@ -91,6 +96,11 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, + machine_id = context.machine_id().map(tracing::field::display), + machine_serial = context.machine_serial(), + driver_version = context.driver_version(), + component_type = context.component_type().map(|component| component.as_str()), + nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, "Log event" From b6838fcb86c5f715c2a4b13f123b31d02230d969 Mon Sep 17 00:00:00 2001 From: Jay Zhu Date: Thu, 25 Jun 2026 15:33:40 -0600 Subject: [PATCH 2/4] fix(health-log-metadata): correct component type coversion Signed-off-by: Jay Zhu --- crates/health/benches/processor_pipeline.rs | 2 - crates/health/src/api_client.rs | 71 +------------------- crates/health/src/discovery/spawn.rs | 4 +- crates/health/src/endpoint/mod.rs | 4 +- crates/health/src/endpoint/model.rs | 34 ++-------- crates/health/src/endpoint/sources.rs | 6 +- crates/health/src/otlp/convert.rs | 36 ++++++++-- crates/health/src/processor/health_report.rs | 3 +- crates/health/src/sink/events.rs | 20 +++--- crates/health/src/sink/health_report.rs | 3 +- crates/health/src/sink/log_file.rs | 5 +- crates/health/src/sink/mod.rs | 5 +- crates/health/src/sink/prometheus.rs | 5 +- crates/health/src/sink/tracing.rs | 4 +- 14 files changed, 61 insertions(+), 141 deletions(-) diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index 3e7c29b3d4..61846b495a 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -97,7 +97,6 @@ fn event_context() -> EventContext { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, } @@ -276,7 +275,6 @@ fn rack_event_contexts(rack_id: &str, tray_count: usize) -> Vec { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new(rack_id)), } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index ddfc3c4844..122d3d8379 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -34,8 +34,8 @@ use url::Url; use crate::HealthError; use crate::bmc::{BmcClient, BoxFuture, CredentialProvider}; use crate::endpoint::{ - BmcAddr, BmcCredentials, BmcEndpoint, ComponentType, EndpointMetadata, EndpointSource, - MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; /// [`ApiEndpointSource`]. @@ -445,7 +445,6 @@ impl ApiEndpointSource { .as_ref() .and_then(|info| info.domain_uuid), driver_version: unique_gpu_driver_version(machine.discovery_info.as_ref()), - component_type: machine_component_type(machine), }) }); @@ -615,20 +614,6 @@ fn unique_gpu_driver_version( .flatten() } -/// Classifies a Forge machine as the component category emitted with health telemetry. -fn machine_component_type(machine: &rpc::forge::Machine) -> ComponentType { - match rpc::forge::MachineType::try_from(machine.machine_type) { - Ok(rpc::forge::MachineType::Dpu) => ComponentType::Dpu, - Ok(rpc::forge::MachineType::Host) => ComponentType::ComputeNode, - Ok(rpc::forge::MachineType::PowerShelf | rpc::forge::MachineType::Unknown) | Err(_) => { - machine - .id - .map(|machine_id| ComponentType::from_machine_type(machine_id.machine_type())) - .unwrap_or(ComponentType::ComputeNode) - } - } -} - impl EndpointSource for ApiEndpointSource { fn fetch_bmc_hosts<'a>(&'a self) -> BoxFuture<'a, Result>, HealthError>> { Box::pin(self.fetch_bmc_hosts()) @@ -712,7 +697,6 @@ mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; use carbide_test_support::value_scenarios; - use carbide_uuid::machine::MachineId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; @@ -769,26 +753,6 @@ mod tests { } } - /// Returns a known DPU ID for fallback classification tests. - fn dpu_machine_id() -> MachineId { - "fm100ds3gfip02lfgleidqoitqgh8d8mdc4a3j2tdncbjrfjtvrrhn2kleg" - .parse() - .expect("valid DPU machine id") - } - - /// Classifies a minimal Forge machine with the supplied type and optional ID. - fn component_type_for_machine_type( - (machine_type, machine_id): (rpc::forge::MachineType, Option), - ) -> ComponentType { - let machine = rpc::forge::Machine { - machine_type: machine_type as i32, - id: machine_id, - ..Default::default() - }; - - machine_component_type(&machine) - } - /// Verifies that driver-version extraction emits only a unique non-empty value. #[test] fn unique_gpu_driver_version_uses_single_non_empty_version() { @@ -824,37 +788,6 @@ mod tests { ); } - /// Verifies that Forge host and DPU machine types map to telemetry categories. - #[test] - fn machine_component_type_uses_api_machine_type() { - value_scenarios!( - run = component_type_for_machine_type; - "host" { - (rpc::forge::MachineType::Host, None) => ComponentType::ComputeNode, - } - - "dpu" { - (rpc::forge::MachineType::Dpu, None) => ComponentType::Dpu, - } - - "unknown with dpu machine id" { - (rpc::forge::MachineType::Unknown, Some(dpu_machine_id())) => ComponentType::Dpu, - } - - "unknown without machine id" { - (rpc::forge::MachineType::Unknown, None) => ComponentType::ComputeNode, - } - - "power shelf with dpu machine id" { - (rpc::forge::MachineType::PowerShelf, Some(dpu_machine_id())) => ComponentType::Dpu, - } - - "power shelf without machine id" { - (rpc::forge::MachineType::PowerShelf, None) => ComponentType::ComputeNode, - } - ); - } - #[test] fn cache_returns_existing_client_on_matching_kind() { let mut cache: HashMap = HashMap::new(); diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index f048251083..1dd99330f1 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -564,8 +564,7 @@ mod tests { }; use crate::endpoint::test_support::endpoint_with_creds; use crate::endpoint::{ - BmcAddr, BmcCredentials, ComponentType, EndpointMetadata, MachineData, SwitchData, - SwitchEndpointRole, + BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; @@ -640,7 +639,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, }) } diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 5e67ff5648..e5e4a9e33b 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -19,8 +19,8 @@ mod model; mod sources; pub use model::{ - BmcAddr, BmcCredentials, BmcEndpoint, ComponentType, EndpointMetadata, EndpointSource, - MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; pub use sources::{CompositeEndpointSource, StaticEndpointSource}; diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index da499c45a9..4230e5a529 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -19,7 +19,7 @@ use std::borrow::Cow; use std::net::IpAddr; use std::sync::Arc; -use carbide_uuid::machine::{MachineId, MachineType}; +use carbide_uuid::machine::MachineId; use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; @@ -92,32 +92,13 @@ impl EndpointMetadata { EndpointMetadata::Switch(switch) => Some(switch.serial.as_str()), } } -} - -/// Hardware component category attached to machine health telemetry. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum ComponentType { - /// Forge-managed host compute node. - ComputeNode, - - /// Data processing unit machine. - Dpu, -} -impl ComponentType { - /// Returns the stable telemetry spelling for this component category. - pub const fn as_str(self) -> &'static str { + /// Returns the PHR component category represented by this endpoint metadata. + pub const fn component_type(&self) -> &'static str { match self { - Self::ComputeNode => "compute_node", - Self::Dpu => "dpu", - } - } - - /// Maps a NICo machine type to its health telemetry component category. - pub const fn from_machine_type(machine_type: MachineType) -> Self { - match machine_type { - MachineType::Dpu => Self::Dpu, - MachineType::Host | MachineType::PredictedHost => Self::ComputeNode, + Self::Machine(_) => "compute_node", + Self::PowerShelf(_) => "power_shelf", + Self::Switch(_) => "nvlink_switch", } } } @@ -146,9 +127,6 @@ pub struct MachineData { /// non-empty GPU driver version for the machine. It stays absent when the /// version is unknown or the discovered GPUs report conflicting versions. pub driver_version: Option, - - /// Component category derived from the machine type. - pub component_type: ComponentType, } #[derive(Clone, Debug)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index dd774c5f23..ef27f6c88a 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -28,8 +28,8 @@ use crate::HealthError; use crate::bmc::{BmcClient, FixedCredentialProvider}; use crate::config::{StaticBmcEndpoint, StaticSwitchEndpointRole}; use crate::endpoint::{ - BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, ComponentType, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; pub struct StaticEndpointSource { @@ -136,7 +136,6 @@ impl StaticEndpointSource { tray_index: machine.tray_index, nvlink_domain_uuid, driver_version: machine.driver_version.clone(), - component_type: ComponentType::from_machine_type(machine_id.machine_type()), })), Err(error) => { tracing::warn!( @@ -420,7 +419,6 @@ mod tests { assert_eq!(machine.tray_index, Some(5)); assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); assert_eq!(machine.driver_version.as_deref(), Some("570.82")); - assert_eq!(machine.component_type, ComponentType::ComputeNode); } other => panic!("expected Machine metadata, got {other:?}"), } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index c68aec7633..39e210b5cd 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -95,7 +95,7 @@ fn resource_attributes(context: &EventContext) -> Vec { attrs.push(kv("driver.version", driver_version.to_string())); } if let Some(component_type) = context.component_type() { - attrs.push(kv("component.type", component_type.as_str().to_string())); + attrs.push(kv("component.type", component_type.to_string())); } if let Some(switch_id) = context.switch_id() { attrs.push(kv("switch.id", switch_id.to_string())); @@ -309,13 +309,14 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; use super::*; use crate::endpoint::{ - BmcAddr, ComponentType, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + BmcAddr, EndpointMetadata, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, @@ -395,7 +396,6 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(domain_uuid), driver_version: Some("570.82".to_string()), - component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new("RACK_1")), }; @@ -434,7 +434,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, }; @@ -481,6 +480,7 @@ mod tests { Some(switch_id_attr.as_str()) ); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); assert_eq!(attr_int_value(&attrs, "switch.tray_index"), Some(3)); } @@ -530,6 +530,7 @@ mod tests { assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); assert_eq!(attr_value(&attrs, "collector.type"), Some("nvue_gnmi")); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); } #[test] @@ -576,6 +577,33 @@ mod tests { assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("bmc")); assert_eq!(attr_bool_value(&attrs, "switch.is_primary"), Some(false)); assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); + } + + #[test] + fn resource_attributes_include_power_shelf_component_type() { + let power_shelf_id = + PowerShelfId::from_str("ps100ht038bg3qsho433vkg684heguv282qaggmrsh2ugn1qk096n2c6hcg") + .expect("valid power shelf id"); + let context = EventContext { + endpoint_key: "33:44:55:66:77:88".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 3, 1)), + port: Some(443), + mac: MacAddress::from_str("33:44:55:66:77:88").expect("valid mac"), + }, + collector_type: "sensor_collector", + metadata: Some(EndpointMetadata::PowerShelf(PowerShelfData { + id: Some(power_shelf_id), + serial: "SN-PS-001".to_string(), + })), + rack_id: Some(RackId::new("RACK_4")), + }; + + let attrs = resource_attributes(&context); + + assert_eq!(attr_value(&attrs, "component.type"), Some("power_shelf")); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_4")); } #[test] diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index dcc68154bf..ef0805c430 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -261,7 +261,7 @@ mod tests { use nv_redfish::resource::Health as BmcHealth; use super::*; - use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::sink::HealthReportTarget; fn test_context() -> EventContext { @@ -282,7 +282,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 4fe6e51d43..48e7844163 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -30,9 +30,7 @@ use health_report::{ use nv_redfish::resource::Health as BmcHealth; use serde::Serialize; -use crate::endpoint::{ - BmcAddr, BmcEndpoint, ComponentType, EndpointMetadata, MachineData, SwitchEndpointRole, -}; +use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, MachineData, SwitchEndpointRole}; use crate::metrics::MetricLabel; #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -93,10 +91,9 @@ impl EventContext { .and_then(|machine| machine.driver_version.as_deref()) } - /// Returns the machine component category when the endpoint is a machine. - pub fn component_type(&self) -> Option { - self.machine_metadata() - .map(|machine| machine.component_type) + /// Returns the PHR component category for endpoints with typed metadata. + pub fn component_type(&self) -> Option<&'static str> { + self.metadata.as_ref().map(EndpointMetadata::component_type) } /// Returns the physical rack slot number when the endpoint is a machine. @@ -544,7 +541,7 @@ mod tests { nvlink_domain_uuid: Option, machine_serial: Option, driver_version: Option, - component_type: Option, + component_type: Option<&'static str>, switch_id: Option, switch_serial: Option, switch_endpoint_role: Option, @@ -639,7 +636,6 @@ mod tests { tray_index: Some(3), nvlink_domain_uuid: Some(nvlink_domain_id()), driver_version: Some("570.82".to_string()), - component_type: ComponentType::ComputeNode, })), ContextKind::Switch => Some(EndpointMetadata::Switch(SwitchData { id: Some(switch_id()), @@ -1078,7 +1074,7 @@ mod tests { nvlink_domain_uuid: Some(nvlink_domain_id().to_string()), machine_serial: Some("MN-001".to_string()), driver_version: Some("570.82".to_string()), - component_type: Some(ComponentType::ComputeNode), + component_type: Some("compute_node"), switch_id: None, switch_serial: None, switch_endpoint_role: None, @@ -1101,7 +1097,7 @@ mod tests { nvlink_domain_uuid: None, machine_serial: None, driver_version: None, - component_type: None, + component_type: Some("nvlink_switch"), switch_id: Some(switch_id().to_string()), switch_serial: Some("SW-001".to_string()), switch_endpoint_role: Some(SwitchEndpointRole::Host), @@ -1124,7 +1120,7 @@ mod tests { nvlink_domain_uuid: None, machine_serial: None, driver_version: None, - component_type: None, + component_type: Some("power_shelf"), switch_id: None, switch_serial: None, switch_endpoint_role: None, diff --git a/crates/health/src/sink/health_report.rs b/crates/health/src/sink/health_report.rs index 7a15e256ba..3218b79cbe 100644 --- a/crates/health/src/sink/health_report.rs +++ b/crates/health/src/sink/health_report.rs @@ -236,7 +236,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::sink::events::{Classification, HealthReportAlert, HealthReportSuccess, Probe}; fn machine_id(value: &str) -> MachineId { @@ -273,7 +273,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/src/sink/log_file.rs b/crates/health/src/sink/log_file.rs index 859fe8b093..f0d597c1bf 100644 --- a/crates/health/src/sink/log_file.rs +++ b/crates/health/src/sink/log_file.rs @@ -111,7 +111,7 @@ impl<'a> JsonLogRecord<'a> { machine_id: context.machine_id().map(|id| id.to_string()), machine_serial: context.machine_serial(), driver_version: context.driver_version(), - component_type: context.component_type().map(|component| component.as_str()), + component_type: context.component_type(), nvlink_domain_uuid: context.nvlink_domain_uuid().map(|id| id.to_string()), severity: &record.severity, body: &record.body, @@ -245,7 +245,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::sink::DiagnosticLogRecord; /// Builds a base log context without endpoint metadata. @@ -275,7 +275,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: Some(NvLinkDomainId::nil()), driver_version: Some("570.82".to_string()), - component_type: ComponentType::ComputeNode, })), ..test_context() } diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index 6f3ee4f3e3..d79734a75b 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -68,7 +68,7 @@ mod tests { CollectorEvent, CompositeDataSink, DataSink, DiagnosticLogRecord, EventContext, LogRecord, MetricSample, PrometheusSink, }; - use crate::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::metrics::MetricsManager; struct CountingSink { @@ -165,7 +165,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, }; @@ -239,7 +238,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, }; @@ -296,7 +294,6 @@ mod tests { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, }; diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 8819ed6e43..4b1679d848 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -244,9 +244,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{ - BmcAddr, ComponentType, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, - }; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; fn test_switch_id(label: &str) -> SwitchId { let mut hash = [0u8; 32]; @@ -274,7 +272,6 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(NvLinkDomainId::nil()), driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: Some(RackId::new("RACK_1")), }; diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index e250095539..34e2f78390 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -85,7 +85,7 @@ impl DataSink for TracingSink { machine_id = context.machine_id().map(tracing::field::display), machine_serial = context.machine_serial(), driver_version = context.driver_version(), - component_type = context.component_type().map(|component| component.as_str()), + component_type = context.component_type(), nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, @@ -99,7 +99,7 @@ impl DataSink for TracingSink { machine_id = context.machine_id().map(tracing::field::display), machine_serial = context.machine_serial(), driver_version = context.driver_version(), - component_type = context.component_type().map(|component| component.as_str()), + component_type = context.component_type(), nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, From bdaffaa1093229987b3c147f5e0d8f382cd88022 Mon Sep 17 00:00:00 2001 From: Jay Zhu Date: Thu, 25 Jun 2026 16:06:40 -0600 Subject: [PATCH 3/4] fix(health): remove residual `ComponentType` Signed-off-by: Jay Zhu --- crates/health/benches/collector_pipeline.rs | 3 +-- crates/health/benches/processor_pipeline.rs | 2 +- crates/health/benches/sink_pipeline.rs | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 307e620382..8cf84eef5d 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -21,7 +21,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ CollectorEvent, CompositeDataSink, DataSink, EventContext, FirmwareInfo, LogRecord, @@ -61,7 +61,6 @@ fn event_context() -> EventContext { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, } diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index 61846b495a..ca98b8630c 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -20,7 +20,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::processor::{ EventProcessingPipeline, EventProcessor, HealthReportProcessor, LeakEventProcessor, diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 3ad58c0179..c8e9e6b2e6 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -21,7 +21,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; -use carbide_health::endpoint::{BmcAddr, ComponentType, EndpointMetadata, MachineData}; +use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ Classification, CollectorEvent, CompositeDataSink, DataSink, EventContext, HealthReport, @@ -71,7 +71,6 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { tray_index: None, nvlink_domain_uuid: None, driver_version: None, - component_type: ComponentType::ComputeNode, })), rack_id: None, } From c85150a636103f6515ba187bd7d31f4279398236 Mon Sep 17 00:00:00 2001 From: Jay Zhu Date: Thu, 25 Jun 2026 17:38:17 -0600 Subject: [PATCH 4/4] fix(health): normalize static driver versions Trim static machine driver_version before storing metadata and omit it when the value is empty, matching API discovery behavior. Signed-off-by: Jay Zhu --- crates/health/src/endpoint/sources.rs | 44 +++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index ef27f6c88a..676607cf1e 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -128,6 +128,13 @@ impl StaticEndpointSource { }, ); + let driver_version = machine + .driver_version + .as_deref() + .map(str::trim) + .filter(|driver_version| !driver_version.is_empty()) + .map(str::to_string); + match machine_id.parse() { Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData { machine_id, @@ -135,7 +142,7 @@ impl StaticEndpointSource { slot_number: machine.slot_number, tray_index: machine.tray_index, nvlink_domain_uuid, - driver_version: machine.driver_version.clone(), + driver_version, })), Err(error) => { tracing::warn!( @@ -393,7 +400,7 @@ mod tests { slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), - driver_version: Some("570.82".to_string()), + driver_version: Some(" 570.82 ".to_string()), }), power_shelf: None, switch: None, @@ -424,6 +431,39 @@ mod tests { } } + #[tokio::test] + async fn test_static_machine_endpoint_omits_empty_driver_version() { + let configs = vec![StaticBmcEndpoint { + ip: ip("10.0.1.3"), + port: Some(443), + mac: "11:22:33:44:55:12".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: Some(StaticMachineEndpoint { + id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(), + serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + driver_version: Some(" ".to_string()), + }), + power_shelf: None, + switch: None, + rack_id: None, + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + match &endpoints[0].metadata { + Some(EndpointMetadata::Machine(machine)) => { + assert_eq!(machine.driver_version, None); + } + other => panic!("expected Machine metadata, got {other:?}"), + } + } + #[tokio::test] async fn test_static_endpoint_without_switch_serial_has_no_metadata() { let configs = vec![StaticBmcEndpoint {