diff --git a/Cargo.lock b/Cargo.lock index ce8febd5d5..5dcb786210 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1893,15 +1893,16 @@ dependencies = [ "eyre", "futures-util", "hex", - "md5 0.8.0", "regex", "reqwest 0.13.4", "serde", + "sha2 0.11.0", "sqlx", "temp-dir", "tokio", "toml 1.1.2+spec-1.1.0", "tracing", + "url", ] [[package]] @@ -6830,12 +6831,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" -[[package]] -name = "md5" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae960838283323069879657ca3de837e9f7bbb4c7bf6ea7f1b290d5e9476d2e0" - [[package]] name = "memchr" version = "2.8.1" @@ -9494,7 +9489,7 @@ dependencies = [ "internal-russh-num-bigint", "keccak", "log", - "md5 0.7.0", + "md5", "ml-kem", "module-lattice", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index b59558f7e2..82a4449701 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -157,7 +157,6 @@ local-ip-address = "0.6.3" log = "0.4.19" lru = "0.18.0" mac_address = "1.1" -md5 = "0.8" metrics = "0.24" metrics-exporter-prometheus = "0.18" mime = "0.3" diff --git a/crates/api-model/src/firmware.rs b/crates/api-model/src/firmware.rs index 40573eb4a2..5a8e735894 100644 --- a/crates/api-model/src/firmware.rs +++ b/crates/api-model/src/firmware.rs @@ -20,6 +20,7 @@ use std::fmt::{Debug, Display}; use std::path::PathBuf; use regex::Regex; +use serde::de::{self, Deserializer}; use serde::{Deserialize, Serialize}; use crate::site_explorer::EndpointExplorationReport; @@ -205,13 +206,57 @@ pub struct FirmwareEntry { pub scout: Option, } -#[derive(Clone, Debug, Deserialize, Serialize, Default)] +#[derive(Clone, Debug, Serialize, Default)] pub struct FirmwareFileArtifact { - pub filename: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub filename: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, pub sha256: String, } -#[derive(Clone, Debug, Deserialize, Serialize, Default)] +#[derive(Deserialize)] +struct FirmwareFileArtifactWire { + #[serde(default)] + filename: Option, + #[serde(default)] + url: Option, + sha256: String, +} + +// Transitional validation while firmware metadata supports both local artifacts +// and URL-based artifacts. Once metadata is fully URL-based, `url` should +// become required and `filename` can be removed (and this impl block too). +impl<'de> Deserialize<'de> for FirmwareFileArtifact { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let wire = FirmwareFileArtifactWire::deserialize(deserializer)?; + if !firmware_file_artifact_location_is_set(&wire.filename) + && !firmware_file_artifact_location_is_set(&wire.url) + { + return Err(de::Error::custom( + "firmware files[] artifact must set filename or url", + )); + } + + Ok(Self { + filename: wire.filename, + url: wire.url, + sha256: wire.sha256, + }) + } +} + +fn firmware_file_artifact_location_is_set(value: &Option) -> bool { + value + .as_deref() + .map(str::trim) + .is_some_and(|value| !value.is_empty()) +} + +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ScoutConfig { /// Legacy script metadata accepted for backwards-compatible config parsing. /// Scout script selection is inferred from the PXE script registry. @@ -276,6 +321,18 @@ impl FirmwareEntry { ret } + pub fn artifact_count(&self) -> usize { + if !self.files.is_empty() { + self.files.len() + } else if !self.filenames.is_empty() { + self.filenames.len() + } else if self.filename.is_some() || self.url.is_some() { + 1 + } else { + 0 + } + } + pub fn get_filename(&self, pos: u32) -> PathBuf { let pos = pos.try_into().unwrap_or(usize::MAX); let filename = if self.filenames.is_empty() { @@ -326,3 +383,135 @@ impl Display for AgentUpgradePolicyChoice { std::fmt::Debug::fmt(&self, f) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn artifact_count_prefers_files_then_legacy_fields() { + let files_count = FirmwareEntry { + files: vec![ + FirmwareFileArtifact { + filename: Some("first.bin".to_string()), + url: None, + sha256: "abc123".to_string(), + }, + FirmwareFileArtifact { + filename: Some("second.bin".to_string()), + url: None, + sha256: "def456".to_string(), + }, + ], + filenames: vec!["legacy.bin".to_string()], + filename: Some("single-legacy.bin".to_string()), + ..FirmwareEntry::default() + }; + assert_eq!(files_count.artifact_count(), 2); + + let filenames_count = FirmwareEntry { + filenames: vec!["first.bin".to_string(), "second.bin".to_string()], + filename: Some("single-legacy.bin".to_string()), + ..FirmwareEntry::default() + }; + assert_eq!(filenames_count.artifact_count(), 2); + + let filename_count = FirmwareEntry { + filename: Some("single-legacy.bin".to_string()), + ..FirmwareEntry::default() + }; + assert_eq!(filename_count.artifact_count(), 1); + + let url_count = FirmwareEntry { + url: Some("https://firmware.example.invalid/fw.bin".to_string()), + ..FirmwareEntry::default() + }; + assert_eq!(url_count.artifact_count(), 1); + + assert_eq!(FirmwareEntry::default().artifact_count(), 0); + } + + #[test] + fn firmware_file_artifact_deserializes_when_filename_or_url_is_set() { + let cases = [ + ( + r#" +filename = "/opt/nico/firmware/fw.bin" +sha256 = "abc123" +"#, + (Some("/opt/nico/firmware/fw.bin"), None), + ), + ( + r#" +url = "https://firmware.example.invalid/fw.bin" +sha256 = "def456" +"#, + (None, Some("https://firmware.example.invalid/fw.bin")), + ), + ]; + + for (input, (expected_filename, expected_url)) in cases { + let artifact = toml::from_str::(input).unwrap(); + + assert_eq!(artifact.filename.as_deref(), expected_filename); + assert_eq!(artifact.url.as_deref(), expected_url); + } + } + + #[test] + fn firmware_file_artifact_deserialization_requires_filename_or_url() { + let invalid_artifacts = [ + r#" +sha256 = "abc123" +"#, + r#" +filename = "" +sha256 = "abc123" +"#, + r#" +url = " " +sha256 = "abc123" +"#, + r#" +filename = " " +url = "" +sha256 = "abc123" +"#, + ]; + + for input in invalid_artifacts { + let error = toml::from_str::(input).unwrap_err(); + + assert!( + error + .to_string() + .contains("firmware files[] artifact must set filename or url") + ); + } + } + + #[test] + fn firmware_deserialization_rejects_files_artifact_without_location() { + let input = r#" +model = "DGXH100" +vendor = "Nvidia" + +[components.cx7] +current_version_reported_as = "^CX7_[0-9]+$" + +[[components.cx7.known_firmware]] +version = "28.47.2682" + +[[components.cx7.known_firmware.files]] +sha256 = "abc123" +"#; + + let error = toml::from_str::(input).unwrap_err(); + + assert!( + error + .to_string() + .contains("firmware files[] artifact must set filename or url") + ); + } +} diff --git a/crates/firmware/Cargo.toml b/crates/firmware/Cargo.toml index 1fbdc959dc..d5110fc8bb 100644 --- a/crates/firmware/Cargo.toml +++ b/crates/firmware/Cargo.toml @@ -21,14 +21,15 @@ carbide-api-model = { path = "../api-model", default-features = false } eyre = { workspace = true } futures-util = { workspace = true } hex = { workspace = true } -md5 = { workspace = true } regex = { workspace = true, optional = true } reqwest = { workspace = true } serde = { workspace = true } +sha2 = { workspace = true } temp-dir = { workspace = true, optional = true } tokio = { workspace = true } toml = { workspace = true } tracing = { workspace = true } +url = { workspace = true } [dev-dependencies] carbide-test-support = { path = "../test-support" } diff --git a/crates/firmware/src/artifact_cache.rs b/crates/firmware/src/artifact_cache.rs new file mode 100644 index 0000000000..0e45f0a440 --- /dev/null +++ b/crates/firmware/src/artifact_cache.rs @@ -0,0 +1,81 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::path::{Path, PathBuf}; + +use sha2::{Digest, Sha256}; +use url::Url; + +pub fn firmware_cache_filename(firmware_cache_directory: &Path, url: &str) -> Option { + let url_hash = hex::encode(Sha256::digest(url.as_bytes())); + let filename = filename_from_url(url)?; + + Some(firmware_cache_directory.join(url_hash).join(filename)) +} + +fn filename_from_url(raw_url: &str) -> Option { + let url = Url::parse(raw_url).ok()?; + let filename = url.path_segments()?.next_back()?.trim(); + + if filename.is_empty() || filename == "." || filename == ".." { + None + } else { + Some(filename.to_owned()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn firmware_cache_filename_is_under_cache_directory() { + let firmware_cache_directory = Path::new("/mnt/persistence/fw/download-cache"); + let filename = firmware_cache_filename( + firmware_cache_directory, + "https://firmware.example.invalid/path/iDRAC-with-Lifecycle-Controller_Firmware_WN31M_LN64_7.20.60.50_A00.BIN?token=secret", + ) + .unwrap(); + + assert!(filename.starts_with(firmware_cache_directory)); + assert_eq!( + filename + .parent() + .unwrap() + .file_name() + .unwrap() + .to_string_lossy() + .len(), + 64 + ); + assert_eq!( + filename.file_name().unwrap(), + "iDRAC-with-Lifecycle-Controller_Firmware_WN31M_LN64_7.20.60.50_A00.BIN" + ); + } + + #[test] + fn firmware_cache_filename_returns_none_without_filename() { + let firmware_cache_directory = Path::new("/mnt/persistence/fw/download-cache"); + let filename = firmware_cache_filename( + firmware_cache_directory, + "https://firmware.example.invalid/", + ); + + assert_eq!(filename, None); + } +} diff --git a/crates/firmware/src/downloader.rs b/crates/firmware/src/downloader.rs index 09e6e46cc8..a32cfb14f6 100644 --- a/crates/firmware/src/downloader.rs +++ b/crates/firmware/src/downloader.rs @@ -25,6 +25,7 @@ use std::time::Duration; use eyre::{Report, WrapErr, eyre}; use futures_util::StreamExt; use reqwest::Client; +use sha2::{Digest, Sha256}; use tokio::fs::File; #[derive(Clone, Debug)] @@ -57,21 +58,23 @@ impl FirmwareDownloader { /// available will return true if the given file is present, otherwise it will return false after starting a download in the background. /// Anything trying to check the same file while it is downloading will get the exact same result, but will not start a new download. - /// It provides no guarantee that the checksum matches other than on the initial download. - pub fn available(&self, filename: &Path, url: &str, checksum: &str) -> bool { - self.available_actual(filename, url, checksum, None) + /// It verifies the downloaded file against sha256 when a checksum is provided. + pub fn available(&self, filename: &Path, url: &str, sha256: &str) -> bool { + self.available_actual(filename, url, sha256, None) } - // Actual implementation, made visible to unit tests only + // Implementation behind available(). Tests call this directly to control async timing. pub(crate) fn available_actual( &self, filename: &Path, url: &str, - checksum: &str, + sha256: &str, fake_sleep: Option, ) -> bool { - if filename.exists() { - return true; + match cached_file_status(filename, sha256) { + CachedFileStatus::Available => return true, + CachedFileStatus::NeedsDownload => {} + CachedFileStatus::Unusable => return false, } if url.is_empty() { @@ -88,8 +91,10 @@ impl FirmwareDownloader { } // Slight timing hole, recheck for the file - if filename.exists() { - return true; + match cached_file_status(filename, sha256) { + CachedFileStatus::Available => return true, + CachedFileStatus::NeedsDownload => {} + CachedFileStatus::Unusable => return false, } state.downloading.insert(filename_string.clone()); @@ -99,9 +104,9 @@ impl FirmwareDownloader { let filename = filename.to_path_buf(); let url = url.to_owned(); + let sha256 = sha256.to_owned(); let client = state.client.clone().unwrap(); let actual = self.actual.clone(); - let checksum = checksum.to_owned(); tokio::spawn(async move { let dst_filename = format!("{filename_string}.download"); match download(&filename, &url, &dst_filename, client, fake_sleep).await { @@ -115,7 +120,7 @@ impl FirmwareDownloader { } Ok(_) => { tracing::info!("Completed download of {url} to {filename_string}"); - if let Err(e) = verify_checksum(&dst_filename, &checksum) { + if let Err(e) = verify_sha256(&dst_filename, &sha256) { tracing::error!("FirmwareDownloader checksum for {url} failed: {e}"); let _ = std::fs::remove_file(dst_filename); actual @@ -151,6 +156,40 @@ impl FirmwareDownloaderActual { } } +enum CachedFileStatus { + Available, + NeedsDownload, + Unusable, +} + +fn cached_file_status(filename: &Path, sha256: &str) -> CachedFileStatus { + let filename_str = filename.to_string_lossy(); + + if !filename.exists() { + return CachedFileStatus::NeedsDownload; + } + + match verify_sha256(&filename_str, sha256) { + Ok(()) => CachedFileStatus::Available, + Err(err) => { + tracing::warn!( + "Cached firmware artifact {} failed checksum verification: {err}", + filename.display() + ); + + if let Err(err) = std::fs::remove_file(filename) { + tracing::error!( + "Failed to remove stale cached firmware artifact {}: {err}", + filename.display() + ); + return CachedFileStatus::Unusable; + } + + CachedFileStatus::NeedsDownload + } + } +} + async fn download( filename: &Path, url: &String, @@ -160,7 +199,7 @@ async fn download( ) -> Result<(), Report> { // Actual downloader. We aren't able to return errors to callers here, we just print to the log, and will retry on the next request. let dirname = match Path::parent(filename) { - Some(x) => x.to_string_lossy().to_string(), + Some(x) => x, None => { return Err(eyre!( "Could not find dirname of {}", @@ -169,7 +208,8 @@ async fn download( } }; - let _ = std::fs::create_dir_all(dirname); + std::fs::create_dir_all(dirname) + .wrap_err(format!("Unable to create directory {}", dirname.display()))?; let mut dst_file = File::create(&dst_filename) .await .wrap_err(format!("Unable to create file {dst_filename}"))?; @@ -223,23 +263,30 @@ async fn download( Ok(()) } -/// verify_checks checks if the given filename uses the given checksum. This is not meant to be security, +/// Checks if the given filename uses the given checksum. This is not meant to be security, /// it's to check against download corruption or retrieving the wrong thing (such as if the vendor changed the URL). /// We expect the hardware vendor to have done their own signing to ensure that firmware is not compromised. -fn verify_checksum(filename: &String, checksum: &String) -> Result<(), Report> { +fn verify_sha256(filename: &str, checksum: &str) -> Result<(), Report> { + let checksum = checksum.trim().to_ascii_lowercase(); if checksum.is_empty() { - // No validation requested return Ok(()); } - // md5 doesn't support async, must use the standard + let mut file = std::fs::File::open(filename)?; - let mut context = md5::Context::new(); - std::io::copy(&mut file, &mut context)?; + let mut context = Sha256::new(); + let mut buffer = [0; 8192]; + loop { + let read = std::io::Read::read(&mut file, &mut buffer)?; + if read == 0 { + break; + } + context.update(&buffer[..read]); + } - let checksum_actual = hex::encode(*context.finalize()); + let checksum_actual = hex::encode(context.finalize()); - if &checksum_actual != checksum { + if checksum_actual != checksum { return Err(eyre!( "Checksum mismatch: Expected {checksum} downloaded {checksum_actual}" )); diff --git a/crates/firmware/src/lib.rs b/crates/firmware/src/lib.rs index 5ba7262591..39d252b2d6 100644 --- a/crates/firmware/src/lib.rs +++ b/crates/firmware/src/lib.rs @@ -18,6 +18,8 @@ #[cfg(test)] mod tests; +mod artifact_cache; + pub mod config; pub mod defaults; @@ -27,5 +29,6 @@ pub mod downloader; #[cfg(feature = "test-support")] pub mod test_support; +pub use artifact_cache::firmware_cache_filename; pub use config::{FirmwareConfig, FirmwareConfigSnapshot}; pub use downloader::FirmwareDownloader; diff --git a/crates/firmware/src/tests/config.rs b/crates/firmware/src/tests/config.rs index 64d239c33e..d99fe3af43 100644 --- a/crates/firmware/src/tests/config.rs +++ b/crates/firmware/src/tests/config.rs @@ -240,6 +240,10 @@ power_drains_needed = 1 filename = "/opt/carbide/firmware/nvidia-dgxh100-cx7-28.47.2682/cx7.bin" sha256 = "abc123" +[[components.cx7.known_firmware.files]] +url = "https://firmware.example.invalid/cx7.bin" +sha256 = "def456" + [components.cx7.known_firmware.scout] execution_timeout_seconds = 1800 artifact_download_timeout_seconds = 600 @@ -259,7 +263,16 @@ artifact_download_timeout_seconds = 600 let firmware = cx7.known_firmware.first().unwrap(); assert_eq!(firmware.version, "28.47.2682"); assert_eq!(firmware.power_drains_needed, Some(1)); - assert_eq!(firmware.files.len(), 1); + assert_eq!(firmware.files.len(), 2); + assert_eq!( + firmware.files[0].filename.as_deref(), + Some("/opt/carbide/firmware/nvidia-dgxh100-cx7-28.47.2682/cx7.bin") + ); + assert_eq!(firmware.files[1].filename, None); + assert_eq!( + firmware.files[1].url.as_deref(), + Some("https://firmware.example.invalid/cx7.bin") + ); let scout = firmware.scout.as_ref().unwrap(); assert_eq!(scout.execution_timeout_seconds, 1800); diff --git a/crates/firmware/src/tests/downloader.rs b/crates/firmware/src/tests/downloader.rs index 68143b237e..6cf62cf934 100644 --- a/crates/firmware/src/tests/downloader.rs +++ b/crates/firmware/src/tests/downloader.rs @@ -18,6 +18,7 @@ use std::path::Path; use std::time::Duration; +use sha2::Digest; use tokio::fs::File; use tokio::io::AsyncWriteExt; @@ -46,22 +47,23 @@ async fn test_firmware_downloader_repeated() { } #[tokio::test] -async fn test_checksum() -> Result<(), std::io::Error> { - // Test that the checksum validation works - let filename = Path::new("/tmp/test_firmware_checksum"); - let url = "file://tmp/test_firmware_checksum_src".to_string(); +async fn test_download_without_checksum() -> Result<(), std::io::Error> { + let filename = Path::new("/tmp/test_firmware_without_checksum"); + let src_filename = "/tmp/test_firmware_without_checksum_src"; + let url = format!("file://{src_filename}"); - let mut srcfile = File::create("/tmp/test_firmware_checksum_src").await?; + let mut srcfile = File::create(src_filename).await?; for i in 0..2000 { srcfile.write_all(format!("{i}").as_bytes()).await?; } + srcfile.flush().await?; let _ = std::fs::remove_file(filename); let downloader = FirmwareDownloader::new(); let mut count = 0; loop { - if !downloader.available(filename, &url, "a08232ef8a758330f8698442550157f7") { + if !downloader.available(filename, &url, "") { tokio::time::sleep(Duration::from_millis(10)).await; count += 1; if count >= 1000 { @@ -69,8 +71,102 @@ async fn test_checksum() -> Result<(), std::io::Error> { } } else { let _ = std::fs::remove_file(filename); - let _ = std::fs::remove_file("/tmp/test_Firmware_checksum_src"); + let _ = std::fs::remove_file(src_filename); return Ok(()); } } } + +#[tokio::test] +async fn test_available_verifies_sha256_checksum() -> Result<(), std::io::Error> { + let filename = Path::new("/tmp/test_firmware_sha256_checksum"); + let src_filename = "/tmp/test_firmware_sha256_checksum_src"; + let url = format!("file://{src_filename}"); + let contents = b"firmware artifact"; + + let mut srcfile = File::create(src_filename).await?; + srcfile.write_all(contents).await?; + srcfile.flush().await?; + + let _ = std::fs::remove_file(filename); + let downloader = FirmwareDownloader::new(); + let checksum = format!( + " {} ", + hex::encode(sha2::Sha256::digest(contents)).to_ascii_uppercase() + ); + + let mut count = 0; + loop { + if !downloader.available(filename, &url, &checksum) { + tokio::time::sleep(Duration::from_millis(10)).await; + count += 1; + if count >= 1000 { + panic!("Should not have taken this long"); + } + } else { + let _ = std::fs::remove_file(filename); + let _ = std::fs::remove_file(src_filename); + return Ok(()); + } + } +} + +#[tokio::test] +async fn test_available_rejects_stale_cache_with_wrong_sha256() -> Result<(), std::io::Error> { + let filename = Path::new("/tmp/test_firmware_stale_cache_wrong_checksum"); + let src_filename = "/tmp/test_firmware_stale_cache_wrong_checksum_src"; + let url = format!("file://{src_filename}"); + let contents = b"fresh firmware artifact"; + + let mut cached_file = File::create(filename).await?; + cached_file.write_all(b"stale firmware artifact").await?; + cached_file.flush().await?; + drop(cached_file); + + let mut srcfile = File::create(src_filename).await?; + srcfile.write_all(contents).await?; + srcfile.flush().await?; + drop(srcfile); + + let downloader = FirmwareDownloader::new(); + let checksum = hex::encode(sha2::Sha256::digest(contents)); + + assert!(!downloader.available(filename, &url, &checksum)); + + let mut count = 0; + loop { + if !downloader.available(filename, &url, &checksum) { + tokio::time::sleep(Duration::from_millis(10)).await; + count += 1; + if count >= 1000 { + panic!("Should not have taken this long"); + } + } else { + assert_eq!(tokio::fs::read(filename).await?, contents); + let _ = std::fs::remove_file(filename); + let _ = std::fs::remove_file(src_filename); + return Ok(()); + } + } +} + +#[tokio::test] +async fn test_available_checksum_failure_does_not_publish_file() -> Result<(), std::io::Error> { + let filename = Path::new("/tmp/test_firmware_sha256_checksum_failure"); + let src_filename = "/tmp/test_firmware_sha256_checksum_failure_src"; + let url = format!("file://{src_filename}"); + + let mut srcfile = File::create(src_filename).await?; + srcfile.write_all(b"firmware artifact").await?; + srcfile.flush().await?; + + let _ = std::fs::remove_file(filename); + let downloader = FirmwareDownloader::new(); + + assert!(!downloader.available(filename, &url, &"0".repeat(64))); + tokio::time::sleep(Duration::from_millis(500)).await; + + assert!(!filename.exists()); + let _ = std::fs::remove_file(src_filename); + Ok(()) +} diff --git a/crates/machine-controller/src/config/firmware_global.rs b/crates/machine-controller/src/config/firmware_global.rs index edd02a6a6e..aeae9575ce 100644 --- a/crates/machine-controller/src/config/firmware_global.rs +++ b/crates/machine-controller/src/config/firmware_global.rs @@ -62,6 +62,9 @@ pub struct FirmwareGlobal { /// the first doesn't exist. A deployer can pin either explicitly. #[serde(default = "FirmwareGlobal::firmware_directory_default")] pub firmware_directory: PathBuf, + /// Writable directory used to cache downloaded firmware artifacts. + #[serde(default = "FirmwareGlobal::firmware_download_cache_directory_default")] + pub firmware_download_cache_directory: PathBuf, /// Delay before retrying a failed host firmware /// upgrade. /// Default is 60 minutes. @@ -107,6 +110,7 @@ impl FirmwareGlobal { run_interval: Duration::seconds(5), concurrency_limit: FirmwareGlobal::concurrency_limit_default(), firmware_directory: PathBuf::default(), + firmware_download_cache_directory: PathBuf::default(), host_firmware_upgrade_retry_interval: Self::get_retry_interval(), instance_updates_manual_tagging: false, no_reset_retries: false, @@ -144,6 +148,9 @@ impl FirmwareGlobal { } PathBuf::from("/opt/carbide/firmware") } + pub fn firmware_download_cache_directory_default() -> PathBuf { + PathBuf::from("/mnt/persistence/fw/download-cache") + } pub fn host_firmware_upgrade_retry_interval_default() -> Duration { Duration::minutes(60) } @@ -165,6 +172,8 @@ impl Default for FirmwareGlobal { max_uploads: FirmwareGlobal::max_uploads_default(), concurrency_limit: FirmwareGlobal::concurrency_limit_default(), firmware_directory: FirmwareGlobal::firmware_directory_default(), + firmware_download_cache_directory: + FirmwareGlobal::firmware_download_cache_directory_default(), host_firmware_upgrade_retry_interval: FirmwareGlobal::host_firmware_upgrade_retry_interval_default(), instance_updates_manual_tagging: false, diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index d2fa94cbf7..a59e1291a9 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -20,7 +20,6 @@ use std::collections::{HashMap, HashSet}; use std::mem::discriminant as enum_discr; use std::net::IpAddr; -use std::path::{Component, Path}; use std::str::FromStr; use std::sync::{Arc, Mutex}; @@ -106,6 +105,7 @@ use crate::config::{ }; use crate::context::{MachineStateHandlerContextObjects, MachineStateHandlerServices}; use crate::dpf::DpfOperations; +use crate::handler::firmware_artifact::ResolvedFirmwareArtifactSource; use crate::health_report::{ create_host_update_health_report_dpufw, create_host_update_health_report_hostfw, }; @@ -117,6 +117,7 @@ use crate::{MeasuringOutcome, get_measuring_prerequisites, handle_measuring_stat pub mod attestation; mod bios_config; mod dpf; +mod firmware_artifact; mod helpers; mod machine_validation; mod power; @@ -181,37 +182,6 @@ fn scout_firmware_upgrade_deadline( started_at + Duration::seconds(deadline_seconds) } -fn firmware_artifact_url( - pxe_public_base_url: &str, - firmware_dir: &Path, - path: &str, -) -> Result { - let relative = Path::new(path).strip_prefix(firmware_dir).map_err(|_| { - StateHandlerError::GenericError(eyre!( - "firmware artifact path {path} is outside firmware directory {}", - firmware_dir.display() - )) - })?; - - if !relative - .components() - .all(|component| matches!(component, Component::Normal(_))) - { - return Err(StateHandlerError::GenericError(eyre!( - "firmware artifact path {path} contains unsafe path components" - ))); - } - - let relative = relative.to_str().ok_or_else(|| { - StateHandlerError::GenericError(eyre!("firmware artifact path {path} is not valid UTF-8")) - })?; - - Ok(format!( - "{}/public/firmware/{relative}", - pxe_public_base_url.trim_end_matches('/') - )) -} - /// Reachability params to check if DPU is up or not. #[derive(Copy, Clone, Debug)] pub struct ReachabilityParams { @@ -8328,16 +8298,13 @@ impl HostUpgradeState { .artifact_download_timeout_seconds, file_artifacts: to_install .files - .into_iter() - .map(|f| { - Ok(FileArtifact { - url: firmware_artifact_url( - pxe_public_base_url, - firmware_dir, - &f.filename, - )?, - sha256: f.sha256, - }) + .iter() + .map(|artifact| { + firmware_artifact::resolve_scout_file_artifact( + pxe_public_base_url, + firmware_dir, + artifact, + ) }) .collect::, StateHandlerError>>()?, }; @@ -8759,20 +8726,46 @@ impl HostUpgradeState { let snapshot = &state.host_snapshot; let to_install = fw_info.to_install; let component_type = fw_info.component_type; + let artifact = firmware_artifact::resolve_firmware_artifact( + &ctx.services + .site_config + .firmware_global + .firmware_download_cache_directory, + to_install, + *fw_info.firmware_number, + )?; + + match &artifact.source { + ResolvedFirmwareArtifactSource::Remote { url, sha256 } => { + if !self.downloader.available(&artifact.local_path, url, sha256) { + tracing::debug!( + "{} is being downloaded from {}, update deferred", + artifact.local_path.display(), + url + ); - if !self.downloader.available( - &to_install.get_filename(*fw_info.firmware_number), - &to_install.get_url(), - &to_install.get_checksum(), - ) { - tracing::debug!( - "{} is being downloaded from {}, update deferred", - to_install.get_filename(*fw_info.firmware_number).display(), - to_install.get_url() - ); - - return Ok(StateHandlerOutcome::do_nothing()); - } + return Ok(StateHandlerOutcome::do_nothing()); + } + } + ResolvedFirmwareArtifactSource::Local => { + if !artifact.local_path.exists() { + let reason = format!( + "firmware artifact {} for machine {} is not present and no firmware artifact URL is configured", + artifact.local_path.display(), + snapshot.id + ); + tracing::warn!("{reason}"); + return Ok(StateHandlerOutcome::transition(scenario.actual_new_state( + HostReprovisionState::FailedFirmwareUpgrade { + firmware_type: *component_type, + report_time: Some(Utc::now()), + reason: Some(reason), + }, + state.managed_state.get_host_repro_retry_count(), + ))); + } + } + }; let Ok(_active) = self.upload_limiter.try_acquire() else { tracing::debug!( @@ -8826,7 +8819,7 @@ impl HostUpgradeState { } let machine_id = state.host_snapshot.id.to_string(); - let filename = to_install.get_filename(*fw_info.firmware_number); + let filename = artifact.local_path; let redfish_component_type: libredfish::model::update_service::ComponentType = match to_install.install_only_specified { false => libredfish::model::update_service::ComponentType::Unknown, @@ -9033,9 +9026,12 @@ impl HostUpgradeState { component_info.known_firmware.iter().find(|&x| x.default) { let firmware_number = firmware_number.unwrap_or(0) + 1; - if firmware_number - < selected_firmware.filenames.len().try_into().unwrap_or(0) - { + let has_more_artifacts = usize::try_from(firmware_number) + .map(|firmware_number| { + firmware_number < selected_firmware.artifact_count() + }) + .unwrap_or(false); + if has_more_artifacts { tracing::debug!( "Moving {:?} chain step {} on {} to CheckingFirmware", selected_firmware, @@ -11497,49 +11493,6 @@ mod tests { assert_eq!(deadline, started_at + Duration::hours(5)); } - #[test] - fn firmware_artifact_url_uses_path_relative_to_firmware_directory() { - let url = firmware_artifact_url( - "http://carbide-pxe.forge:8080/", - Path::new("/opt/nico/firmware"), - "/opt/nico/firmware/nvidia/dgxh100/cx7/cx7.bin", - ) - .unwrap(); - - assert_eq!( - url, - "http://carbide-pxe.forge:8080/public/firmware/nvidia/dgxh100/cx7/cx7.bin" - ); - } - - #[test] - fn firmware_artifact_url_rejects_unsafe_paths() { - let unsafe_cases = [ - ( - // A sibling directory with the same string prefix is not inside the firmware root. - "/opt/nico/firmware2/nvidia/dgxh100/cx7/cx7.bin", - "is outside firmware directory /opt/nico/firmware", - ), - ( - // A path under the firmware root still cannot traverse back out with `..`. - "/opt/nico/firmware/../cx7.bin", - "contains unsafe path components", - ), - ]; - - for (path, expected_error) in unsafe_cases { - let Err(error) = firmware_artifact_url( - "http://carbide-pxe.forge:8080", - Path::new("/opt/nico/firmware"), - path, - ) else { - panic!("expected unsafe path {path} to be rejected"); - }; - - assert!(error.to_string().contains(expected_error)); - } - } - #[test] fn need_host_fw_upgrade_checks_all_matching_cx7_inventories() { let firmware_type = FirmwareComponentType::Cx7; diff --git a/crates/machine-controller/src/handler/firmware_artifact.rs b/crates/machine-controller/src/handler/firmware_artifact.rs new file mode 100644 index 0000000000..e538db0b44 --- /dev/null +++ b/crates/machine-controller/src/handler/firmware_artifact.rs @@ -0,0 +1,505 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::path::{Component, Path, PathBuf}; + +use carbide_firmware::firmware_cache_filename; +use eyre::eyre; +use model::firmware::{FirmwareEntry, FirmwareFileArtifact}; +use state_controller::state_handler::StateHandlerError; + +use crate::rpc::scout_firmware_upgrade::FileArtifact; + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ResolvedFirmwareArtifact { + pub(crate) local_path: PathBuf, + pub(crate) source: ResolvedFirmwareArtifactSource, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) enum ResolvedFirmwareArtifactSource { + Remote { url: String, sha256: String }, + Local, +} + +pub(crate) fn resolve_firmware_artifact( + firmware_download_cache_directory: &Path, + firmware: &FirmwareEntry, + pos: u32, +) -> Result { + if firmware.files.is_empty() { + return Ok(ResolvedFirmwareArtifact { + local_path: firmware.get_filename(pos), + source: ResolvedFirmwareArtifactSource::Local, + }); + } + + let index = usize::try_from(pos).unwrap_or(usize::MAX); + let artifact = firmware.files.get(index).ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "firmware version {} has no files[] artifact at index {}", + firmware.version, + pos + )) + })?; + + let url = artifact + .url + .as_deref() + .map(str::trim) + .filter(|url| !url.is_empty()); + + let filename = artifact + .filename + .as_deref() + .map(str::trim) + .filter(|filename| !filename.is_empty()); + + let local_path = if let Some(url) = url { + firmware_cache_filename(firmware_download_cache_directory, url).ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "firmware version {} files[] artifact at index {} URL does not include a filename", + firmware.version, + pos + )) + })? + } else if let Some(filename) = filename { + PathBuf::from(filename) + } else { + return Err(StateHandlerError::GenericError(eyre!( + "firmware version {} files[] artifact at index {} has no filename or URL", + firmware.version, + pos + ))); + }; + + let source = match url { + Some(url) => ResolvedFirmwareArtifactSource::Remote { + url: url.to_owned(), + sha256: artifact.sha256.clone(), + }, + None => ResolvedFirmwareArtifactSource::Local, + }; + + Ok(ResolvedFirmwareArtifact { local_path, source }) +} + +pub(crate) fn resolve_scout_file_artifact( + pxe_public_base_url: &str, + firmware_directory: &Path, + artifact: &FirmwareFileArtifact, +) -> Result { + let url = artifact + .url + .as_deref() + .map(str::trim) + .filter(|url| !url.is_empty()); + + let filename = artifact + .filename + .as_deref() + .map(str::trim) + .filter(|filename| !filename.is_empty()); + + let url = if let Some(url) = url { + url.to_owned() + } else if let Some(filename) = filename { + firmware_artifact_url(pxe_public_base_url, firmware_directory, filename)? + } else { + return Err(StateHandlerError::GenericError(eyre!( + "scout firmware artifact has no filename or URL" + ))); + }; + + Ok(FileArtifact { + url, + sha256: artifact.sha256.clone(), + }) +} + +fn firmware_artifact_url( + pxe_public_base_url: &str, + firmware_directory: &Path, + path: &str, +) -> Result { + let relative = Path::new(path) + .strip_prefix(firmware_directory) + .map_err(|_| { + StateHandlerError::GenericError(eyre!( + "firmware artifact path {path} is outside firmware directory {}", + firmware_directory.display() + )) + })?; + + if !relative + .components() + .all(|component| matches!(component, Component::Normal(_))) + { + return Err(StateHandlerError::GenericError(eyre!( + "firmware artifact path {path} contains unsafe path components" + ))); + } + + let relative = relative.to_str().ok_or_else(|| { + StateHandlerError::GenericError(eyre!("firmware artifact path {path} is not valid UTF-8")) + })?; + + Ok(format!( + "{}/public/firmware/{relative}", + pxe_public_base_url.trim_end_matches('/') + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + const FIRMWARE_DOWNLOAD_CACHE_DIRECTORY: &str = "/mnt/persistence/fw/download-cache"; + + #[test] + fn resolves_files_artifact_with_url_and_filename() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/fw.bin".to_string()), + url: Some("https://firmware.example.invalid/fw.bin".to_string()), + sha256: "abc123".to_string(), + }]); + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap(); + + assert!( + artifact + .local_path + .starts_with(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY) + ); + assert_eq!(artifact.local_path.file_name().unwrap(), "fw.bin"); + assert_eq!( + artifact.source, + ResolvedFirmwareArtifactSource::Remote { + url: "https://firmware.example.invalid/fw.bin".to_string(), + sha256: "abc123".to_string(), + } + ); + } + + #[test] + fn derives_files_artifact_filename_from_url_when_filename_is_missing() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: None, + url: Some("https://firmware.example.invalid/path/fw_image.bin".to_string()), + sha256: "abc123".to_string(), + }]); + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap(); + + assert!( + artifact + .local_path + .starts_with(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY) + ); + assert_eq!(artifact.local_path.file_name().unwrap(), "fw_image.bin"); + assert_eq!( + artifact + .local_path + .parent() + .unwrap() + .file_name() + .unwrap() + .to_string_lossy() + .len(), + 64 + ); + } + + #[test] + fn resolves_files_artifact_by_index() { + let firmware = firmware_with_files(vec![ + FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/first.bin".to_string()), + url: Some("https://firmware.example.invalid/first.bin".to_string()), + sha256: "first-sha".to_string(), + }, + FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/second.bin".to_string()), + url: Some("https://firmware.example.invalid/second.bin".to_string()), + sha256: "second-sha".to_string(), + }, + ]); + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 1) + .unwrap(); + + assert!( + artifact + .local_path + .starts_with(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY) + ); + assert_eq!(artifact.local_path.file_name().unwrap(), "second.bin"); + assert_eq!( + artifact.source, + ResolvedFirmwareArtifactSource::Remote { + url: "https://firmware.example.invalid/second.bin".to_string(), + sha256: "second-sha".to_string(), + } + ); + } + + #[test] + fn resolves_files_artifact_without_url_as_local_file() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/fw.bin".to_string()), + url: None, + sha256: "abc123".to_string(), + }]); + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap(); + + assert_eq!( + artifact.local_path, + PathBuf::from("/opt/nico/firmware/fw.bin") + ); + assert_eq!(artifact.source, ResolvedFirmwareArtifactSource::Local); + } + + #[test] + fn files_artifact_index_out_of_range_is_an_error() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/fw.bin".to_string()), + url: None, + sha256: "abc123".to_string(), + }]); + + let error = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 1) + .unwrap_err(); + + assert!( + error + .to_string() + .contains("has no files[] artifact at index 1") + ); + } + + #[test] + fn files_artifact_without_filename_or_url_is_an_error() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: None, + url: None, + sha256: "abc123".to_string(), + }]); + + let error = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap_err(); + + assert!(error.to_string().contains("has no filename or URL")); + } + + #[test] + fn files_artifact_url_without_filename_is_an_error() { + let firmware = firmware_with_files(vec![FirmwareFileArtifact { + filename: None, + url: Some("https://firmware.example.invalid/".to_string()), + sha256: "abc123".to_string(), + }]); + + let error = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap_err(); + + assert!( + error + .to_string() + .contains("URL does not include a filename") + ); + } + + #[test] + fn legacy_firmware_ignores_top_level_url_and_resolves_as_local_source() { + let firmware = FirmwareEntry { + version: "1.0".to_string(), + filename: None, + filenames: vec![ + "/opt/nico/firmware/first.bin".to_string(), + "/opt/nico/firmware/second.bin".to_string(), + ], + url: Some("https://firmware.example.invalid/legacy.bin".to_string()), + checksum: Some("legacy-sha".to_string()), + ..FirmwareEntry::default() + }; + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 1) + .unwrap(); + + assert_eq!( + artifact.local_path, + PathBuf::from("/opt/nico/firmware/second.bin") + ); + assert_eq!(artifact.source, ResolvedFirmwareArtifactSource::Local); + } + + #[test] + fn legacy_firmware_without_url_resolves_as_local_source() { + let firmware = FirmwareEntry { + version: "1.0".to_string(), + filename: Some("/opt/nico/firmware/fw.bin".to_string()), + url: None, + checksum: None, + ..FirmwareEntry::default() + }; + + let artifact = + resolve_firmware_artifact(Path::new(FIRMWARE_DOWNLOAD_CACHE_DIRECTORY), &firmware, 0) + .unwrap(); + + assert_eq!( + artifact.local_path, + PathBuf::from("/opt/nico/firmware/fw.bin") + ); + assert_eq!(artifact.source, ResolvedFirmwareArtifactSource::Local); + } + + #[test] + fn resolve_scout_file_artifact_uses_direct_url_when_url_and_filename_are_set() { + let artifact = FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/nvidia/fw.bin".to_string()), + url: Some("https://firmware.example.invalid/fw.bin".to_string()), + sha256: "abc123".to_string(), + }; + + let file_artifact = resolve_scout_file_artifact( + "http://carbide-pxe.forge:8080", + Path::new("/opt/nico/firmware"), + &artifact, + ) + .expect("artifact should resolve"); + + assert_eq!(file_artifact.url, "https://firmware.example.invalid/fw.bin"); + assert_eq!(file_artifact.sha256, "abc123"); + } + + #[test] + fn resolve_scout_file_artifact_uses_pxe_url_for_filename_without_url() { + let artifact = FirmwareFileArtifact { + filename: Some("/opt/nico/firmware/nvidia/fw.bin".to_string()), + url: None, + sha256: "abc123".to_string(), + }; + + let file_artifact = resolve_scout_file_artifact( + "http://carbide-pxe.forge:8080/", + Path::new("/opt/nico/firmware"), + &artifact, + ) + .expect("artifact should resolve"); + + assert_eq!( + file_artifact.url, + "http://carbide-pxe.forge:8080/public/firmware/nvidia/fw.bin" + ); + assert_eq!(file_artifact.sha256, "abc123"); + } + + #[test] + fn resolve_scout_file_artifact_uses_direct_url_without_filename() { + let artifact = FirmwareFileArtifact { + filename: None, + url: Some("https://firmware.example.invalid/fw.bin".to_string()), + sha256: "abc123".to_string(), + }; + + let file_artifact = resolve_scout_file_artifact( + "http://carbide-pxe.forge:8080", + Path::new("/opt/nico/firmware"), + &artifact, + ) + .expect("artifact should resolve"); + + assert_eq!(file_artifact.url, "https://firmware.example.invalid/fw.bin"); + assert_eq!(file_artifact.sha256, "abc123"); + } + + #[test] + fn resolve_scout_file_artifact_requires_filename_or_url() { + let artifact = FirmwareFileArtifact { + filename: None, + url: None, + sha256: "abc123".to_string(), + }; + + let error = resolve_scout_file_artifact( + "http://carbide-pxe.forge:8080", + Path::new("/opt/nico/firmware"), + &artifact, + ) + .unwrap_err(); + + assert!( + error + .to_string() + .contains("scout firmware artifact has no filename or URL") + ); + } + + #[test] + fn resolve_scout_file_artifact_rejects_unsafe_filename_paths() { + let unsafe_cases = [ + ( + // A sibling directory with the same string prefix is not inside the firmware root. + "/opt/nico/firmware2/nvidia/dgxh100/cx7/cx7.bin", + "is outside firmware directory /opt/nico/firmware", + ), + ( + // A path under the firmware root still cannot traverse back out with `..`. + "/opt/nico/firmware/../cx7.bin", + "contains unsafe path components", + ), + ]; + + for (filename, expected_error) in unsafe_cases { + let artifact = FirmwareFileArtifact { + filename: Some(filename.to_string()), + url: None, + sha256: "abc123".to_string(), + }; + + let error = resolve_scout_file_artifact( + "http://carbide-pxe.forge:8080", + Path::new("/opt/nico/firmware"), + &artifact, + ) + .unwrap_err(); + + assert!(error.to_string().contains(expected_error)); + } + } + + fn firmware_with_files(files: Vec) -> FirmwareEntry { + FirmwareEntry { + version: "1.0".to_string(), + files, + ..FirmwareEntry::default() + } + } +} diff --git a/crates/machine-controller/src/scout_firmware_scripts.rs b/crates/machine-controller/src/scout_firmware_scripts.rs index ae651a6c91..b94ef3ecdf 100644 --- a/crates/machine-controller/src/scout_firmware_scripts.rs +++ b/crates/machine-controller/src/scout_firmware_scripts.rs @@ -190,7 +190,13 @@ impl ScoutFirmwareScriptSelector { /// This keeps vendor/model strings case-insensitive while preventing path /// traversal or accidental nested directories from host inventory data. fn safe_lookup_key(value: &str) -> Option { - let value = value.trim().to_ascii_lowercase(); + let value = value + .trim() + .to_ascii_lowercase() + .split_ascii_whitespace() + .collect::>() + .join("_"); + if is_safe_path_segment(&value) { Some(value) } else { @@ -200,9 +206,9 @@ fn safe_lookup_key(value: &str) -> Option { /// Checks whether a value is safe to use as one filesystem path segment. /// -/// "dgx h100" -> rejected -/// "../dgxh100" -> rejected /// "dgxh100" -> allowed +/// "poweredge_r760" -> allowed +/// "../dgxh100" -> rejected fn is_safe_path_segment(value: &str) -> bool { !value.is_empty() && value @@ -385,6 +391,37 @@ mod tests { assert_eq!(script.artifact_download_timeout_seconds, 20); } + #[test] + fn normalizes_model_name_spaces_to_underscores() { + let tempdir = tempfile::tempdir().unwrap(); + let component_dir = tempdir.path().join("dell/poweredge_r760/bmc"); + fs::create_dir_all(&component_dir).unwrap(); + fs::write(component_dir.join(SCOUT_FIRMWARE_SCRIPT_FILE), "echo ok\n").unwrap(); + fs::write( + component_dir.join(SCOUT_FIRMWARE_METADATA_FILE), + "execution_timeout_seconds = 10\nartifact_download_timeout_seconds = 20\n", + ) + .unwrap(); + + let script = find_scout_script_in( + tempdir.path(), + TEST_PXE_PUBLIC_BASE_URL, + bmc_vendor::BMCVendor::Dell, + "PowerEdge R760", + FirmwareComponentType::Bmc, + ) + .unwrap() + .expect("script should resolve"); + + assert_eq!( + script.url, + "http://carbide-pxe.forge:8080/public/scout-firmware-scripts/dell/poweredge_r760/bmc/upgrade.sh" + ); + assert_eq!(script.sha256, hex::encode(Sha256::digest(b"echo ok\n"))); + assert_eq!(script.execution_timeout_seconds, 10); + assert_eq!(script.artifact_download_timeout_seconds, 20); + } + #[test] fn errors_when_script_or_metadata_pair_is_incomplete() { let tempdir = tempfile::tempdir().unwrap(); diff --git a/docs/observability/core_metrics.md b/docs/observability/core_metrics.md index a6a36ac2ad..0c3a2dfe8b 100644 --- a/docs/observability/core_metrics.md +++ b/docs/observability/core_metrics.md @@ -123,6 +123,7 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). carbide_site_explorer_created_power_shelves_countgaugeThe amount of Power Shelves that had been created by Site Explorer after being identified carbide_site_explorer_enabledgaugeWhether site-explorer is enabled (1) or paused (0) carbide_site_explorer_iteration_latency_millisecondshistogramThe time it took to perform one site explorer iteration +carbide_site_explorer_last_run_statusgaugeThe status of the latest Site Explorer run carbide_site_explorer_phase_latency_millisecondshistogramThe time it took to perform one site explorer iteration phase carbide_site_explorer_update_explored_endpoints_countgaugeCounts from the last update_explored_endpoints phase by kind carbide_switches_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_switches in the system