From e9f2f826bb995701084bc19fd58476404a44edc3 Mon Sep 17 00:00:00 2001 From: Erez Kirson <71442666+kirson-git@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:29:11 +0300 Subject: [PATCH 1/2] fix(agent): default min_dpu_functioning_links to 1 (don't fence DPU on one dead uplink) --- crates/agent/src/main_loop.rs | 58 +++++++---------------------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/crates/agent/src/main_loop.rs b/crates/agent/src/main_loop.rs index 16709adddf..4366872f23 100644 --- a/crates/agent/src/main_loop.rs +++ b/crates/agent/src/main_loop.rs @@ -60,9 +60,9 @@ use crate::network_monitor::{self, NetworkPingerType}; use crate::util::get_host_boot_timestamp; use crate::{ FMDS_MINIMUM_HBN_VERSION, HBNDeviceNames, NVUE_MINIMUM_HBN_VERSION, RunOptions, command_line, - ethernet_virtualization, extension_services, get_non_empty_str, hbn, health, - instance_metadata_endpoint, lldp, machine_inventory_updater, managed_files, mtu, netlink, nvue, - periodic_config_fetcher, pretty_cmd, sysfs, upgrade, + ethernet_virtualization, extension_services, hbn, health, instance_metadata_endpoint, lldp, + machine_inventory_updater, managed_files, mtu, netlink, nvue, periodic_config_fetcher, + pretty_cmd, sysfs, upgrade, }; // Main loop when running in daemon mode @@ -379,7 +379,6 @@ pub async fn setup_and_run( extension_service_manager, nvue_context, dhcp_interface_translation_mode, - current_network_version: CurrentNetworkVersion::default(), }; main_loop.run().await @@ -413,7 +412,6 @@ struct MainLoop { extension_service_manager: extension_services::ExtensionServiceManager, nvue_context: Option, dhcp_interface_translation_mode: Option, - current_network_version: CurrentNetworkVersion, } struct IterationResult { @@ -421,38 +419,6 @@ struct IterationResult { loop_period: std::time::Duration, } -/// `CurrentNetworkVersion` tracks the versions we last successfully applied, -/// mostly so we can avoid hitting the HBN update methods more frequently than -/// needed. -#[derive(Debug, Default)] -struct CurrentNetworkVersion { - managed_host_config_version: Option, - instance_network_config_version: Option, -} - -impl CurrentNetworkVersion { - pub fn matches_versions_from( - &self, - conf: impl AsRef, - ) -> bool { - let conf = conf.as_ref(); - let managed_host_config_version = get_non_empty_str(&conf.managed_host_config_version); - let instance_network_config_version = - get_non_empty_str(&conf.instance_network_config_version); - - self.managed_host_config_version.as_deref() == managed_host_config_version - && self.instance_network_config_version.as_deref() == instance_network_config_version - } - - pub fn update_from(&mut self, conf: impl AsRef) { - let conf = conf.as_ref(); - self.managed_host_config_version = - get_non_empty_str(&conf.managed_host_config_version).map(String::from); - self.instance_network_config_version = - get_non_empty_str(&conf.instance_network_config_version).map(String::from); - } -} - /// Returns the last DHCP request timestamps for all known host interfaces. /// /// When `dhcp_grpc_server` is `Some`, fetches timestamps from the dhcp-server @@ -656,14 +622,7 @@ impl MainLoop { ) .await; - let update_result = if self.current_network_version.matches_versions_from(&conf) - { - tracing::debug!( - "No configuration change, skipping HBN updates: {:?}", - &self.current_network_version - ); - Ok(false) - } else { + let update_result = { if self.options.agent_platform_type.is_dpu_os() && hbn_version >= self.fmds_minimum_hbn_version { @@ -753,7 +712,6 @@ impl MainLoop { }; match joined_result { Ok(has_changed) => { - self.current_network_version.update_from(&conf); has_changed_configs = has_changed; if self.options.agent_platform_type.is_dpu_os() && let Err(err) = mtu::ensure().await @@ -877,7 +835,13 @@ impl MainLoop { hbn_root: &self.agent_config.hbn.root_dir, host_routes: &tenant_peers, has_changed_configs, - min_healthy_links: conf.min_dpu_functioning_links.unwrap_or(2), + // Default to 1: a dual-homed DPU stays usable on a single uplink, so + // losing one redundant link degrades gracefully instead of fencing the + // DPU. Requiring all links by default defeats uplink failover/HA — a + // single dead redundant uplink would block instance provisioning + // (DPU held unhealthy at Assigned/WaitingForNetworkConfig). Operators + // can set min_dpu_functioning_links higher for stricter health. + min_healthy_links: conf.min_dpu_functioning_links.unwrap_or(1), route_servers: &conf.route_servers, hbn_device_names: self.hbn_device_names.clone(), include_dhcp_server: !conf.use_admin_network || conf.is_primary_dpu, From a28af597fcf66b787c644a5b52e69b1ccb1308b3 Mon Sep 17 00:00:00 2001 From: Erez Kirson <71442666+kirson-git@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:36:31 +0300 Subject: [PATCH 2/2] fix(agent): default min_dpu_functioning_links to 1 (do not fence DPU when one redundant uplink is down) --- crates/agent/src/main_loop.rs | 56 ++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/crates/agent/src/main_loop.rs b/crates/agent/src/main_loop.rs index 4366872f23..d611e7c203 100644 --- a/crates/agent/src/main_loop.rs +++ b/crates/agent/src/main_loop.rs @@ -60,9 +60,9 @@ use crate::network_monitor::{self, NetworkPingerType}; use crate::util::get_host_boot_timestamp; use crate::{ FMDS_MINIMUM_HBN_VERSION, HBNDeviceNames, NVUE_MINIMUM_HBN_VERSION, RunOptions, command_line, - ethernet_virtualization, extension_services, hbn, health, instance_metadata_endpoint, lldp, - machine_inventory_updater, managed_files, mtu, netlink, nvue, periodic_config_fetcher, - pretty_cmd, sysfs, upgrade, + ethernet_virtualization, extension_services, get_non_empty_str, hbn, health, + instance_metadata_endpoint, lldp, machine_inventory_updater, managed_files, mtu, netlink, nvue, + periodic_config_fetcher, pretty_cmd, sysfs, upgrade, }; // Main loop when running in daemon mode @@ -379,6 +379,7 @@ pub async fn setup_and_run( extension_service_manager, nvue_context, dhcp_interface_translation_mode, + current_network_version: CurrentNetworkVersion::default(), }; main_loop.run().await @@ -412,6 +413,7 @@ struct MainLoop { extension_service_manager: extension_services::ExtensionServiceManager, nvue_context: Option, dhcp_interface_translation_mode: Option, + current_network_version: CurrentNetworkVersion, } struct IterationResult { @@ -419,6 +421,38 @@ struct IterationResult { loop_period: std::time::Duration, } +/// `CurrentNetworkVersion` tracks the versions we last successfully applied, +/// mostly so we can avoid hitting the HBN update methods more frequently than +/// needed. +#[derive(Debug, Default)] +struct CurrentNetworkVersion { + managed_host_config_version: Option, + instance_network_config_version: Option, +} + +impl CurrentNetworkVersion { + pub fn matches_versions_from( + &self, + conf: impl AsRef, + ) -> bool { + let conf = conf.as_ref(); + let managed_host_config_version = get_non_empty_str(&conf.managed_host_config_version); + let instance_network_config_version = + get_non_empty_str(&conf.instance_network_config_version); + + self.managed_host_config_version.as_deref() == managed_host_config_version + && self.instance_network_config_version.as_deref() == instance_network_config_version + } + + pub fn update_from(&mut self, conf: impl AsRef) { + let conf = conf.as_ref(); + self.managed_host_config_version = + get_non_empty_str(&conf.managed_host_config_version).map(String::from); + self.instance_network_config_version = + get_non_empty_str(&conf.instance_network_config_version).map(String::from); + } +} + /// Returns the last DHCP request timestamps for all known host interfaces. /// /// When `dhcp_grpc_server` is `Some`, fetches timestamps from the dhcp-server @@ -622,7 +656,14 @@ impl MainLoop { ) .await; - let update_result = { + let update_result = if self.current_network_version.matches_versions_from(&conf) + { + tracing::debug!( + "No configuration change, skipping HBN updates: {:?}", + &self.current_network_version + ); + Ok(false) + } else { if self.options.agent_platform_type.is_dpu_os() && hbn_version >= self.fmds_minimum_hbn_version { @@ -712,6 +753,7 @@ impl MainLoop { }; match joined_result { Ok(has_changed) => { + self.current_network_version.update_from(&conf); has_changed_configs = has_changed; if self.options.agent_platform_type.is_dpu_os() && let Err(err) = mtu::ensure().await @@ -835,12 +877,6 @@ impl MainLoop { hbn_root: &self.agent_config.hbn.root_dir, host_routes: &tenant_peers, has_changed_configs, - // Default to 1: a dual-homed DPU stays usable on a single uplink, so - // losing one redundant link degrades gracefully instead of fencing the - // DPU. Requiring all links by default defeats uplink failover/HA — a - // single dead redundant uplink would block instance provisioning - // (DPU held unhealthy at Assigned/WaitingForNetworkConfig). Operators - // can set min_dpu_functioning_links higher for stricter health. min_healthy_links: conf.min_dpu_functioning_links.unwrap_or(1), route_servers: &conf.route_servers, hbn_device_names: self.hbn_device_names.clone(),