Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/health/benches/collector_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ fn event_context() -> EventContext {
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: None,
})),
rack_id: None,
}
Expand Down
2 changes: 2 additions & 0 deletions crates/health/benches/processor_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ fn event_context() -> EventContext {
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: None,
})),
rack_id: None,
}
Expand Down Expand Up @@ -273,6 +274,7 @@ fn rack_event_contexts(rack_id: &str, tray_count: usize) -> Vec<EventContext> {
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: None,
})),
rack_id: Some(RackId::new(rack_id)),
}
Expand Down
1 change: 1 addition & 0 deletions crates/health/benches/sink_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ fn event_context_for_machine(machine_id: &str) -> EventContext {
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: None,
})),
rack_id: None,
}
Expand Down
76 changes: 76 additions & 0 deletions crates/health/src/api_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ impl ApiEndpointSource {
.nvlink_info
.as_ref()
.and_then(|info| info.domain_uuid),
driver_version: unique_gpu_driver_version(machine.discovery_info.as_ref()),
})
});

Expand Down Expand Up @@ -590,6 +591,29 @@ fn cache_or_create_bmc_client(
Ok(client)
}

/// Returns the machine-level GPU driver version derived from discovery data.
///
/// The NICo API reports driver versions per GPU. Health emits one machine-level
/// value only when there is exactly one unique non-empty version across the
/// reported GPUs. Empty strings are treated as missing data; conflicting
/// non-empty versions are treated as ambiguous and omitted.
fn unique_gpu_driver_version(
discovery_info: Option<&rpc::machine_discovery::DiscoveryInfo>,
) -> Option<String> {
let discovery_info = discovery_info?;
let versions = discovery_info
.gpus
.iter()
.map(|gpu| gpu.driver_version.trim())
.filter(|version| !version.is_empty())
.map(str::to_string)
.collect::<HashSet<_>>();

(versions.len() == 1)
.then(|| versions.into_iter().next())
.flatten()
}

impl EndpointSource for ApiEndpointSource {
fn fetch_bmc_hosts<'a>(&'a self) -> BoxFuture<'a, Result<Vec<Arc<BmcEndpoint>>, HealthError>> {
Box::pin(self.fetch_bmc_hosts())
Expand Down Expand Up @@ -672,6 +696,7 @@ impl From<rpc::forge::bmc_credentials::Type> for BmcCredentials {
mod tests {
use std::sync::atomic::{AtomicUsize, Ordering};

use carbide_test_support::value_scenarios;
use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType};
use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams;

Expand Down Expand Up @@ -712,6 +737,57 @@ mod tests {
)?))
}

/// Test helper: creates discovery info containing one GPU per entry of
/// `driver_versions`, in the same order.
///
/// Only each GPU's `driver_version` is set; every other field of the GPU and
/// of the surrounding discovery info is left at its `Default` value.
fn discovery_with_driver_versions(
    driver_versions: &[&str],
) -> rpc::machine_discovery::DiscoveryInfo {
    let gpus = driver_versions
        .iter()
        .copied()
        .map(|version| rpc::machine_discovery::Gpu {
            driver_version: version.to_string(),
            ..Default::default()
        })
        .collect();

    rpc::machine_discovery::DiscoveryInfo {
        gpus,
        ..Default::default()
    }
}

/// Verifies that `unique_gpu_driver_version` yields a value only when the
/// reported GPUs agree on exactly one non-empty driver version.
///
/// Every scenario passes its input through the `run` closure below.
/// NOTE(review): each `input => expected` pair is presumably compared for
/// equality by `value_scenarios!`; confirm against the macro definition.
#[test]
fn unique_gpu_driver_version_uses_single_non_empty_version() {
value_scenarios!(
// Adapter from the owned scenario input to the borrowed form the function takes.
run = |discovery_info: Option<rpc::machine_discovery::DiscoveryInfo>| {
unique_gpu_driver_version(discovery_info.as_ref())
};
// No discovery data at all.
"missing discovery info" {
None => None,
}

// Discovery data present, but with an empty GPU list.
"no gpus" {
Some(discovery_with_driver_versions(&[])) => None,
}

// Empty and whitespace-only versions both count as missing data.
"empty gpu driver versions" {
Some(discovery_with_driver_versions(&["", " "])) => None,
}

// A single GPU's version is returned as-is.
"one gpu driver version" {
Some(discovery_with_driver_versions(&["570.82"])) => Some("570.82".to_string()),
}

// Versions are compared after trimming, so padded duplicates still agree
// and the trimmed form is returned.
"same gpu driver version repeated" {
Some(discovery_with_driver_versions(&["570.82", " 570.82 "])) => {
Some("570.82".to_string())
},
}

// Conflicting versions are ambiguous, so nothing is emitted.
"mixed gpu driver versions" {
Some(discovery_with_driver_versions(&["570.82", "580.12"])) => None,
}
);
}

#[test]
fn cache_returns_existing_client_on_matching_kind() {
let mut cache: HashMap<MacAddress, CachedBmcClient> = HashMap::new();
Expand Down
13 changes: 12 additions & 1 deletion crates/health/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,22 @@ pub struct StaticBmcEndpoint {
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
#[serde(deny_unknown_fields)]
pub struct StaticMachineEndpoint {
/// Stable NICo machine ID for this BMC endpoint.
pub id: String,

/// Optional chassis serial to emit as machine telemetry metadata.
pub serial: Option<String>,

/// Optional uniform GPU driver version to emit for local/static validation.
///
/// The static endpoint source trims surrounding whitespace and treats a
/// value that is empty after trimming as absent.
pub driver_version: Option<String>,

/// Optional slot number for this machine.
///
/// The key `physical_slot_number` is accepted as an alias when deserializing.
#[serde(alias = "physical_slot_number")]
pub slot_number: Option<i32>,

/// Optional tray index for this machine.
///
/// The key `compute_tray_index` is accepted as an alias when deserializing.
#[serde(alias = "compute_tray_index")]
pub tray_index: Option<i32>,

/// Optional NVLink domain UUID associated with this machine.
pub nvlink_domain_uuid: Option<String>,
}

Expand Down Expand Up @@ -2051,7 +2061,7 @@ ip = "10.0.1.2"
mac = "11:22:33:44:55:11"
username = "admin"
password = "pass"
machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" }
machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", driver_version = "570.82", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" }
"#;

let config: Config = Figment::new()
Expand All @@ -2067,6 +2077,7 @@ machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0",

assert_eq!(machine.slot_number, Some(15));
assert_eq!(machine.tray_index, Some(5));
assert_eq!(machine.driver_version.as_deref(), Some("570.82"));
assert_eq!(
machine.nvlink_domain_uuid.as_deref(),
Some("00000000-0000-0000-0000-000000000000")
Expand Down
1 change: 1 addition & 0 deletions crates/health/src/discovery/spawn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,7 @@ mod tests {
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: None,
})
}

Expand Down
26 changes: 26 additions & 0 deletions crates/health/src/endpoint/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,41 @@ impl EndpointMetadata {
EndpointMetadata::Switch(switch) => Some(switch.serial.as_str()),
}
}

/// Returns the PHR component category represented by this endpoint metadata.
pub const fn component_type(&self) -> &'static str {
match self {
Self::Machine(_) => "compute_node",
Self::PowerShelf(_) => "power_shelf",
Self::Switch(_) => "nvlink_switch",
}
}
}

/// Metadata that describes a machine endpoint for health telemetry.
///
/// Every field except `machine_id` is optional and stays `None` when the
/// endpoint source has no value for it.
#[derive(Clone, Debug)]
pub struct MachineData {
/// Stable NICo machine identifier.
pub machine_id: MachineId,

/// Hardware chassis serial discovered from machine DMI data, when known.
pub machine_serial: Option<String>,

/// Physical rack slot where the machine is installed, when known.
pub slot_number: Option<i32>,

/// Compute tray index where the machine is installed, when known.
pub tray_index: Option<i32>,

/// NVLink domain UUID for the machine, when it participates in an NVLink domain.
pub nvlink_domain_uuid: Option<NvLinkDomainId>,

/// Machine-level GPU driver version.
///
/// From API discovery this is populated only when the machine's GPUs report
/// exactly one unique non-empty driver version; it stays absent when the
/// version is unknown or the discovered GPUs report conflicting versions.
/// Statically configured endpoints supply the value directly from
/// configuration, where a blank value is treated as absent.
pub driver_version: Option<String>,
}

#[derive(Clone, Debug)]
Expand Down
43 changes: 43 additions & 0 deletions crates/health/src/endpoint/sources.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,21 @@ impl StaticEndpointSource {
},
);

let driver_version = machine
.driver_version
.as_deref()
.map(str::trim)
.filter(|driver_version| !driver_version.is_empty())
.map(str::to_string);

match machine_id.parse() {
Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData {
machine_id,
machine_serial: machine.serial.clone(),
slot_number: machine.slot_number,
tray_index: machine.tray_index,
nvlink_domain_uuid,
driver_version,
})),
Err(error) => {
tracing::warn!(
Expand Down Expand Up @@ -392,6 +400,7 @@ mod tests {
slot_number: Some(15),
tray_index: Some(5),
nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()),
driver_version: Some(" 570.82 ".to_string()),
}),
power_shelf: None,
switch: None,
Expand All @@ -416,6 +425,40 @@ mod tests {
assert_eq!(machine.slot_number, Some(15));
assert_eq!(machine.tray_index, Some(5));
assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid));
assert_eq!(machine.driver_version.as_deref(), Some("570.82"));
}
other => panic!("expected Machine metadata, got {other:?}"),
}
}

#[tokio::test]
async fn test_static_machine_endpoint_omits_empty_driver_version() {
let configs = vec![StaticBmcEndpoint {
ip: ip("10.0.1.3"),
port: Some(443),
mac: "11:22:33:44:55:12".to_string(),
username: "admin".to_string(),
password: Some("pass".to_string()),
machine: Some(StaticMachineEndpoint {
id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(),
serial: None,
slot_number: None,
tray_index: None,
nvlink_domain_uuid: None,
driver_version: Some(" ".to_string()),
}),
power_shelf: None,
switch: None,
rack_id: None,
}];

let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10);
let endpoints = source.fetch_bmc_hosts().await.unwrap();

assert_eq!(endpoints.len(), 1);
match &endpoints[0].metadata {
Some(EndpointMetadata::Machine(machine)) => {
assert_eq!(machine.driver_version, None);
}
other => panic!("expected Machine metadata, got {other:?}"),
}
Expand Down
Loading
Loading