diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 273865b714..8cf84eef5d 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -60,6 +60,7 @@ fn event_context() -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, } diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index d005eac8df..ca98b8630c 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -96,6 +96,7 @@ fn event_context() -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, } @@ -273,6 +274,7 @@ fn rack_event_contexts(rack_id: &str, tray_count: usize) -> Vec { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: Some(RackId::new(rack_id)), } diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 0e8e9d730f..c8e9e6b2e6 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -70,6 +70,7 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 5ffe158532..122d3d8379 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -444,6 +444,7 @@ impl ApiEndpointSource { .nvlink_info .as_ref() .and_then(|info| info.domain_uuid), + driver_version: unique_gpu_driver_version(machine.discovery_info.as_ref()), }) }); @@ -590,6 +591,29 @@ fn cache_or_create_bmc_client( Ok(client) } +/// Returns the machine-level GPU driver version derived from discovery data. +/// +/// The NICo API reports driver versions per GPU. Health emits one machine-level +/// value only when there is exactly one unique non-empty version across the +/// reported GPUs. Empty strings are treated as missing data; conflicting +/// non-empty versions are treated as ambiguous and omitted. +fn unique_gpu_driver_version( + discovery_info: Option<&rpc::machine_discovery::DiscoveryInfo>, +) -> Option { + let discovery_info = discovery_info?; + let versions = discovery_info + .gpus + .iter() + .map(|gpu| gpu.driver_version.trim()) + .filter(|version| !version.is_empty()) + .map(str::to_string) + .collect::>(); + + (versions.len() == 1) + .then(|| versions.into_iter().next()) + .flatten() +} + impl EndpointSource for ApiEndpointSource { fn fetch_bmc_hosts<'a>(&'a self) -> BoxFuture<'a, Result>, HealthError>> { Box::pin(self.fetch_bmc_hosts()) @@ -672,6 +696,7 @@ impl From for BmcCredentials { mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; + use carbide_test_support::value_scenarios; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; @@ -712,6 +737,57 @@ mod tests { )?)) } + /// Builds discovery metadata with one GPU entry per supplied driver version. + fn discovery_with_driver_versions( + driver_versions: &[&str], + ) -> rpc::machine_discovery::DiscoveryInfo { + rpc::machine_discovery::DiscoveryInfo { + gpus: driver_versions + .iter() + .map(|driver_version| rpc::machine_discovery::Gpu { + driver_version: (*driver_version).to_string(), + ..Default::default() + }) + .collect(), + ..Default::default() + } + } + + /// Verifies that driver-version extraction emits only a unique non-empty value. + #[test] + fn unique_gpu_driver_version_uses_single_non_empty_version() { + value_scenarios!( + run = |discovery_info: Option| { + unique_gpu_driver_version(discovery_info.as_ref()) + }; + "missing discovery info" { + None => None, + } + + "no gpus" { + Some(discovery_with_driver_versions(&[])) => None, + } + + "empty gpu driver versions" { + Some(discovery_with_driver_versions(&["", " "])) => None, + } + + "one gpu driver version" { + Some(discovery_with_driver_versions(&["570.82"])) => Some("570.82".to_string()), + } + + "same gpu driver version repeated" { + Some(discovery_with_driver_versions(&["570.82", " 570.82 "])) => { + Some("570.82".to_string()) + }, + } + + "mixed gpu driver versions" { + Some(discovery_with_driver_versions(&["570.82", "580.12"])) => None, + } + ); + } + #[test] fn cache_returns_existing_client_on_matching_kind() { let mut cache: HashMap = HashMap::new(); diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 3f1c035562..d33cc0a9b4 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -114,12 +114,22 @@ pub struct StaticBmcEndpoint { #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticMachineEndpoint { + /// Stable NICo machine ID for this BMC endpoint. pub id: String, + + /// Optional chassis serial to emit as machine telemetry metadata. pub serial: Option, + + /// Optional uniform GPU driver version to emit for local/static validation. + pub driver_version: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, + + /// Optional NVLink domain UUID associated with this machine. pub nvlink_domain_uuid: Option, } @@ -2051,7 +2061,7 @@ ip = "10.0.1.2" mac = "11:22:33:44:55:11" username = "admin" password = "pass" -machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", driver_version = "570.82", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } "#; let config: Config = Figment::new() @@ -2067,6 +2077,7 @@ machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(machine.slot_number, Some(15)); assert_eq!(machine.tray_index, Some(5)); + assert_eq!(machine.driver_version.as_deref(), Some("570.82")); assert_eq!( machine.nvlink_domain_uuid.as_deref(), Some("00000000-0000-0000-0000-000000000000") diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d0c045dba1..1dd99330f1 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -638,6 +638,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, }) } diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 80d318075d..4230e5a529 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -92,15 +92,41 @@ impl EndpointMetadata { EndpointMetadata::Switch(switch) => Some(switch.serial.as_str()), } } + + /// Returns the PHR component category represented by this endpoint metadata. + pub const fn component_type(&self) -> &'static str { + match self { + Self::Machine(_) => "compute_node", + Self::PowerShelf(_) => "power_shelf", + Self::Switch(_) => "nvlink_switch", + } + } } +/// Metadata that describes a machine endpoint for health telemetry. #[derive(Clone, Debug)] pub struct MachineData { + /// Stable NICo machine identifier. pub machine_id: MachineId, + + /// Hardware chassis serial discovered from machine DMI data, when known. pub machine_serial: Option, + + /// Physical rack slot where the machine is installed, when known. pub slot_number: Option, + + /// Compute tray index where the machine is installed, when known. pub tray_index: Option, + + /// NVLink domain UUID for the machine, when it participates in an NVLink domain. pub nvlink_domain_uuid: Option, + + /// Machine-level GPU driver version. + /// + /// This is populated only when API discovery reports exactly one unique + /// non-empty GPU driver version for the machine. It stays absent when the + /// version is unknown or the discovered GPUs report conflicting versions. + pub driver_version: Option, } #[derive(Clone, Debug)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index cad243113d..676607cf1e 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -128,6 +128,13 @@ impl StaticEndpointSource { }, ); + let driver_version = machine + .driver_version + .as_deref() + .map(str::trim) + .filter(|driver_version| !driver_version.is_empty()) + .map(str::to_string); + match machine_id.parse() { Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData { machine_id, @@ -135,6 +142,7 @@ impl StaticEndpointSource { slot_number: machine.slot_number, tray_index: machine.tray_index, nvlink_domain_uuid, + driver_version, })), Err(error) => { tracing::warn!( @@ -392,6 +400,7 @@ mod tests { slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), + driver_version: Some(" 570.82 ".to_string()), }), power_shelf: None, switch: None, @@ -416,6 +425,40 @@ mod tests { assert_eq!(machine.slot_number, Some(15)); assert_eq!(machine.tray_index, Some(5)); assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); + assert_eq!(machine.driver_version.as_deref(), Some("570.82")); + } + other => panic!("expected Machine metadata, got {other:?}"), + } + } + + #[tokio::test] + async fn test_static_machine_endpoint_omits_empty_driver_version() { + let configs = vec![StaticBmcEndpoint { + ip: ip("10.0.1.3"), + port: Some(443), + mac: "11:22:33:44:55:12".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: Some(StaticMachineEndpoint { + id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(), + serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + driver_version: Some(" ".to_string()), + }), + power_shelf: None, + switch: None, + rack_id: None, + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + match &endpoints[0].metadata { + Some(EndpointMetadata::Machine(machine)) => { + assert_eq!(machine.driver_version, None); } other => panic!("expected Machine metadata, got {other:?}"), } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fb71066fb9..39e210b5cd 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -88,6 +88,15 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(machine_id) = context.machine_id() { attrs.push(kv("machine.id", machine_id.to_string())); } + if let Some(machine_serial) = context.machine_serial() { + attrs.push(kv("machine.serial", machine_serial.to_string())); + } + if let Some(driver_version) = context.driver_version() { + attrs.push(kv("driver.version", driver_version.to_string())); + } + if let Some(component_type) = context.component_type() { + attrs.push(kv("component.type", component_type.to_string())); + } if let Some(switch_id) = context.switch_id() { attrs.push(kv("switch.id", switch_id.to_string())); } @@ -300,12 +309,15 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; + use crate::endpoint::{ + BmcAddr, EndpointMetadata, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + }; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -379,10 +391,11 @@ mod tests { machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" .parse() .expect("valid machine id"), - machine_serial: None, + machine_serial: Some("MN-001".to_string()), slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some(domain_uuid), + driver_version: Some("570.82".to_string()), })), rack_id: Some(RackId::new("RACK_1")), }; @@ -390,6 +403,9 @@ mod tests { let attrs = resource_attributes(&context); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_1")); + assert_eq!(attr_value(&attrs, "machine.serial"), Some("MN-001")); + assert_eq!(attr_value(&attrs, "driver.version"), Some("570.82")); + assert_eq!(attr_value(&attrs, "component.type"), Some("compute_node")); assert_eq!(attr_int_value(&attrs, "machine.slot_number"), Some(15)); assert_eq!(attr_int_value(&attrs, "machine.tray_index"), Some(5)); assert_eq!( @@ -398,6 +414,41 @@ mod tests { ); } + /// Verifies that absent optional machine metadata does not emit empty resource attributes. + #[test] + fn resource_attributes_omit_absent_optional_machine_metadata() { + let context = EventContext { + endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + port: Some(443), + mac: MacAddress::from_str("42:9e:b1:bd:9d:dd").expect("valid mac"), + }, + collector_type: "test", + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + driver_version: None, + })), + rack_id: None, + }; + + let attrs = resource_attributes(&context); + + assert_eq!( + attr_value(&attrs, "machine.id"), + Some("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") + ); + assert_eq!(attr_value(&attrs, "machine.serial"), None); + assert_eq!(attr_value(&attrs, "driver.version"), None); + assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); + } + #[test] fn resource_attributes_include_switch_placement_metadata_when_present() { let switch_id = test_switch_id("switch-a"); @@ -429,6 +480,7 @@ mod tests { Some(switch_id_attr.as_str()) ); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); assert_eq!(attr_int_value(&attrs, "switch.tray_index"), Some(3)); } @@ -478,6 +530,7 @@ mod tests { assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); assert_eq!(attr_value(&attrs, "collector.type"), Some("nvue_gnmi")); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); } #[test] @@ -524,6 +577,33 @@ mod tests { assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("bmc")); assert_eq!(attr_bool_value(&attrs, "switch.is_primary"), Some(false)); assert_eq!(attr_value(&attrs, "nvlink.domain.uuid"), None); + assert_eq!(attr_value(&attrs, "component.type"), Some("nvlink_switch")); + } + + #[test] + fn resource_attributes_include_power_shelf_component_type() { + let power_shelf_id = + PowerShelfId::from_str("ps100ht038bg3qsho433vkg684heguv282qaggmrsh2ugn1qk096n2c6hcg") + .expect("valid power shelf id"); + let context = EventContext { + endpoint_key: "33:44:55:66:77:88".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 3, 1)), + port: Some(443), + mac: MacAddress::from_str("33:44:55:66:77:88").expect("valid mac"), + }, + collector_type: "sensor_collector", + metadata: Some(EndpointMetadata::PowerShelf(PowerShelfData { + id: Some(power_shelf_id), + serial: "SN-PS-001".to_string(), + })), + rack_id: Some(RackId::new("RACK_4")), + }; + + let attrs = resource_attributes(&context); + + assert_eq!(attr_value(&attrs, "component.type"), Some("power_shelf")); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_4")); } #[test] diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 4ccfc23089..ef0805c430 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -281,6 +281,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 7b9a507c05..48e7844163 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -30,7 +30,7 @@ use health_report::{ use nv_redfish::resource::Health as BmcHealth; use serde::Serialize; -use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; +use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, MachineData, SwitchEndpointRole}; use crate::metrics::MetricLabel; #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -65,32 +65,53 @@ impl EventContext { &self.endpoint_key } + /// Returns machine metadata when this context belongs to a machine endpoint. + fn machine_metadata(&self) -> Option<&MachineData> { + let Some(EndpointMetadata::Machine(machine)) = self.metadata.as_ref() else { + return None; + }; + + Some(machine) + } + + /// Returns the stable NICo machine ID when the endpoint is a machine. pub fn machine_id(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => Some(machine.machine_id), - _ => None, - } + self.machine_metadata().map(|machine| machine.machine_id) + } + + /// Returns the machine chassis serial when the endpoint is a machine. + pub fn machine_serial(&self) -> Option<&str> { + self.machine_metadata() + .and_then(|machine| machine.machine_serial.as_deref()) + } + + /// Returns the uniform GPU driver version when it is known for the machine. + pub fn driver_version(&self) -> Option<&str> { + self.machine_metadata() + .and_then(|machine| machine.driver_version.as_deref()) } + /// Returns the PHR component category for endpoints with typed metadata. + pub fn component_type(&self) -> Option<&'static str> { + self.metadata.as_ref().map(EndpointMetadata::component_type) + } + + /// Returns the physical rack slot number when the endpoint is a machine. pub fn slot_number(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.slot_number, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.slot_number) } + /// Returns the compute tray index when the endpoint is a machine. pub fn tray_index(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.tray_index, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.tray_index) } + /// Returns the NVLink domain UUID when the machine participates in one. pub fn nvlink_domain_uuid(&self) -> Option { - match &self.metadata { - Some(EndpointMetadata::Machine(machine)) => machine.nvlink_domain_uuid, - _ => None, - } + self.machine_metadata() + .and_then(|machine| machine.nvlink_domain_uuid) } pub fn switch_id(&self) -> Option { @@ -518,6 +539,9 @@ mod tests { slot_number: Option, tray_index: Option, nvlink_domain_uuid: Option, + machine_serial: Option, + driver_version: Option, + component_type: Option<&'static str>, switch_id: Option, switch_serial: Option, switch_endpoint_role: Option, @@ -611,6 +635,7 @@ mod tests { slot_number: Some(7), tray_index: Some(3), nvlink_domain_uuid: Some(nvlink_domain_id()), + driver_version: Some("570.82".to_string()), })), ContextKind::Switch => Some(EndpointMetadata::Switch(SwitchData { id: Some(switch_id()), @@ -643,6 +668,9 @@ mod tests { slot_number: context.slot_number(), tray_index: context.tray_index(), nvlink_domain_uuid: context.nvlink_domain_uuid().map(|id| id.to_string()), + machine_serial: context.machine_serial().map(str::to_string), + driver_version: context.driver_version().map(str::to_string), + component_type: context.component_type(), switch_id: context.switch_id().map(|id| id.to_string()), switch_serial: context.switch_serial().map(str::to_string), switch_endpoint_role: context.switch_endpoint_role(), @@ -1021,6 +1049,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: None, switch_id: None, switch_serial: None, switch_endpoint_role: None, @@ -1041,6 +1072,9 @@ mod tests { slot_number: Some(7), tray_index: Some(3), nvlink_domain_uuid: Some(nvlink_domain_id().to_string()), + machine_serial: Some("MN-001".to_string()), + driver_version: Some("570.82".to_string()), + component_type: Some("compute_node"), switch_id: None, switch_serial: None, switch_endpoint_role: None, @@ -1061,6 +1095,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: Some("nvlink_switch"), switch_id: Some(switch_id().to_string()), switch_serial: Some("SW-001".to_string()), switch_endpoint_role: Some(SwitchEndpointRole::Host), @@ -1081,6 +1118,9 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + machine_serial: None, + driver_version: None, + component_type: Some("power_shelf"), switch_id: None, switch_serial: None, switch_endpoint_role: None, diff --git a/crates/health/src/sink/health_report.rs b/crates/health/src/sink/health_report.rs index 73c5558f1c..3218b79cbe 100644 --- a/crates/health/src/sink/health_report.rs +++ b/crates/health/src/sink/health_report.rs @@ -272,6 +272,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, } diff --git a/crates/health/src/sink/log_file.rs b/crates/health/src/sink/log_file.rs index 5d6f44ada9..f0d597c1bf 100644 --- a/crates/health/src/sink/log_file.rs +++ b/crates/health/src/sink/log_file.rs @@ -81,10 +81,21 @@ impl DataSink for LogFileSink { } } +/// JSONL representation of a log event written by the file sink. #[derive(Serialize)] struct JsonLogRecord<'a> { endpoint: &'a str, collector: &'a str, + #[serde(skip_serializing_if = "Option::is_none")] + machine_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + machine_serial: Option<&'a str>, + #[serde(skip_serializing_if = "Option::is_none")] + driver_version: Option<&'a str>, + #[serde(skip_serializing_if = "Option::is_none")] + component_type: Option<&'static str>, + #[serde(skip_serializing_if = "Option::is_none")] + nvlink_domain_uuid: Option, severity: &'a str, body: &'a str, #[serde(skip_serializing_if = "Vec::is_empty")] @@ -97,6 +108,11 @@ impl<'a> JsonLogRecord<'a> { Self { endpoint: context.endpoint_key(), collector: context.collector_type, + machine_id: context.machine_id().map(|id| id.to_string()), + machine_serial: context.machine_serial(), + driver_version: context.driver_version(), + component_type: context.component_type(), + nvlink_domain_uuid: context.nvlink_domain_uuid().map(|id| id.to_string()), severity: &record.severity, body: &record.body, attributes: record @@ -225,12 +241,14 @@ mod tests { use std::borrow::Cow; use std::str::FromStr; + use carbide_uuid::nvlink::NvLinkDomainId; use mac_address::MacAddress; use super::*; - use crate::endpoint::BmcAddr; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::sink::DiagnosticLogRecord; + /// Builds a base log context without endpoint metadata. fn test_context() -> EventContext { EventContext { endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), @@ -245,6 +263,23 @@ mod tests { } } + /// Builds a log context with representative machine metadata. + fn machine_context() -> EventContext { + EventContext { + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: Some("MN-001".to_string()), + slot_number: None, + tray_index: None, + nvlink_domain_uuid: Some(NvLinkDomainId::nil()), + driver_version: Some("570.82".to_string()), + })), + ..test_context() + } + } + #[test] fn test_ignores_non_log_events() { let dir = tempfile::tempdir().expect("tempdir"); @@ -385,6 +420,48 @@ mod tests { assert_eq!(parent["body"], "parent log"); } + /// Verifies that machine metadata is emitted as top-level JSONL fields. + #[test] + fn test_writes_machine_metadata_as_jsonl_fields() { + let dir = tempfile::tempdir().expect("tempdir"); + let config = LogFileSinkConfig { + include_diagnostics: false, + output_dir: dir.path().to_string_lossy().into_owned(), + max_file_size: 1024 * 1024, + max_backups: 2, + }; + let sink = LogFileSink::new(&config).expect("sink"); + let ctx = machine_context(); + + let event = CollectorEvent::Log( + LogRecord { + body: "xid event".to_string(), + severity: "WARN".to_string(), + attributes: Vec::new(), + diagnostic_record: None, + } + .into(), + ); + sink.handle_event(&ctx, &event); + + let log_path = dir.path().join("health_logs.jsonl"); + let contents = fs::read_to_string(log_path).expect("read log"); + let line = contents.lines().next().expect("one JSONL record"); + + let parsed: serde_json::Value = serde_json::from_str(line).expect("valid json"); + assert_eq!( + parsed["machine_id"], + "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + ); + assert_eq!(parsed["machine_serial"], "MN-001"); + assert_eq!(parsed["driver_version"], "570.82"); + assert_eq!(parsed["component_type"], "compute_node"); + assert_eq!( + parsed["nvlink_domain_uuid"], + "00000000-0000-0000-0000-000000000000" + ); + } + #[test] fn test_rotation_creates_backups() { let dir = tempfile::tempdir().expect("tempdir"); diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index 9df648faf8..d79734a75b 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -164,6 +164,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, }; @@ -236,6 +237,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, }; @@ -291,6 +293,7 @@ mod tests { slot_number: None, tray_index: None, nvlink_domain_uuid: None, + driver_version: None, })), rack_id: None, }; diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 3822b24d31..4b1679d848 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -271,6 +271,7 @@ mod tests { slot_number: Some(15), tray_index: Some(5), nvlink_domain_uuid: Some(NvLinkDomainId::nil()), + driver_version: None, })), rack_id: Some(RackId::new("RACK_1")), }; diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index a8ba685d2d..34e2f78390 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -82,6 +82,11 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, + machine_id = context.machine_id().map(tracing::field::display), + machine_serial = context.machine_serial(), + driver_version = context.driver_version(), + component_type = context.component_type(), + nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, attributes = ?record.attributes, @@ -91,6 +96,11 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, + machine_id = context.machine_id().map(tracing::field::display), + machine_serial = context.machine_serial(), + driver_version = context.driver_version(), + component_type = context.component_type(), + nvlink_domain_uuid = context.nvlink_domain_uuid().map(tracing::field::display), severity = %record.severity, body = %record.body, "Log event"