diff --git a/crates/machine-a-tron/src/mock_ssh_server.rs b/crates/machine-a-tron/src/mock_ssh_server.rs index 1c6f422940..e33c9b5f72 100644 --- a/crates/machine-a-tron/src/mock_ssh_server.rs +++ b/crates/machine-a-tron/src/mock_ssh_server.rs @@ -45,6 +45,7 @@ pub struct Credentials { pub enum PromptBehavior { Dell, Dpu, + LenovoSr650, } pub async fn spawn( @@ -202,9 +203,10 @@ impl MockSshHandler { format!("\r\nroot@{} # ", self.prompt_hostname.get_hostname()), )?; } - ConsoleState::Bmc => { - session.data(channel, "\nracadm>>")?; - } + ConsoleState::Bmc => match self.prompt_behavior { + PromptBehavior::LenovoSr650 => session.data(channel, "\nsystem>")?, + _ => session.data(channel, "\nracadm>>")?, + }, ConsoleState::NoShell => { // Do nothing } @@ -256,7 +258,7 @@ impl server::Handler for MockSshHandler { ) -> StdResult<(), Self::Error> { tracing::debug!("shell_request"); match self.prompt_behavior { - PromptBehavior::Dell => { + PromptBehavior::Dell | PromptBehavior::LenovoSr650 => { self.console_state = ConsoleState::Bmc; } PromptBehavior::Dpu => { @@ -314,11 +316,33 @@ impl server::Handler for MockSshHandler { ConsoleState::Bmc => { if data == b"\n" || data == b"\r\n" || data == b"\r" { let command = std::mem::take(&mut self.buffer); - if command.starts_with(b"connect com2") { - tracing::info!( - "Got `connect com2` in bmc propmt, simulating system console" - ); - self.console_state = ConsoleState::SystemConsole; + match self.prompt_behavior { + PromptBehavior::Dell if command.starts_with(b"connect com2") => { + tracing::info!( + "Got `connect com2` in bmc prompt, simulating system console" + ); + self.console_state = ConsoleState::SystemConsole; + } + PromptBehavior::LenovoSr650 if command.starts_with(b"console kill 1") => { + tracing::info!( + "Got unsupported Lenovo `console kill 1`, simulating BMC error" + ); + session.data( + channel, + "\r\nThe command line contains extraneous arguments\r\n", + )?; + } + PromptBehavior::LenovoSr650 if command.starts_with(b"console kill") => { + tracing::info!( + "Got Lenovo `console kill`, simulating terminated SOL session" + ); + session.data(channel, "\r\nSession on channel 1 is terminated\r\n")?; + } + PromptBehavior::LenovoSr650 if command.starts_with(b"console start") => { + tracing::info!("Got Lenovo `console start`, simulating system console"); + self.console_state = ConsoleState::SystemConsole; + } + _ => {} } self.print_prompt(session, channel)?; } else { diff --git a/crates/ssh-console/src/bmc/connection_impl/ssh.rs b/crates/ssh-console/src/bmc/connection_impl/ssh.rs index f574f8d2cc..599c08a0c6 100644 --- a/crates/ssh-console/src/bmc/connection_impl/ssh.rs +++ b/crates/ssh-console/src/bmc/connection_impl/ssh.rs @@ -392,8 +392,8 @@ async fn make_authenticated_client( } // Interact with the serial-on-lan console within the BMC ssh session, calling the vendor's serial -// activation command (`connect com1`, etc) and ensuring we're in the serial console before -// continuing. +// activation command (`connect com1`, etc), falling back when needed, and ensuring we're in the +// serial console before continuing. async fn trigger_and_await_sol_console( machine_id: MachineId, ssh_client_channel: &mut Channel, @@ -439,12 +439,15 @@ async fn trigger_and_await_sol_console( })?; let mut prompt_buf: Vec = Vec::with_capacity(1024); - let timeout = tokio::time::Instant::now() + std::time::Duration::from_secs(30); + let mut timeout = tokio::time::Instant::now() + std::time::Duration::from_secs(30); // After sending the activate command, wait for this much data to be read back (the command // itself echoing back, plus the prompt length) before continuing. (If we let the client use the // console before this, we get false positives about seeing a bmc prompt while we're supposed to // be in the console.) - let skip_data_read_len = bmc_prompt.len() + activate_command.len(); + let mut skip_data_read_len = bmc_prompt.len() + activate_command.len(); + let mut fallback_activate_sent = false; + let mut fallback_activate_commands: Option<&'static [&'static [u8]]> = None; + let mut next_fallback_command_index = 0; let mut activation_step = SerialConsoleActivationStep::WaitingForBmcPrompt; loop { @@ -467,14 +470,12 @@ async fn trigger_and_await_sol_console( // We saw the prompt, send the serial activate command (`connect com1`, // etc) one byte at a time: This seems to work better with some // consoles. - for byte in activate_command { - ssh_client_channel - .data([*byte].as_slice()) - .await - .map_err(|error| ConsoleActivateError::Request { phase: "sending serial activate command to BMC", error })?; - } - ssh_client_channel.data(b"\n".as_slice()).await - .map_err(|error| ConsoleActivateError::Request { phase: "sending data to BMC", error })?; + send_command_bytewise( + ssh_client_channel, + activate_command, + "sending serial activate command to BMC", + ) + .await?; activation_step = SerialConsoleActivationStep::ActivateSent; // Clear the prompt prompt_buf.clear(); @@ -486,7 +487,77 @@ async fn trigger_and_await_sol_console( // get false positives about seeing a bmc prompt while we're supposed to be // in the console.) if matches!(activation_step, SerialConsoleActivationStep::ActivateSent) - && prompt_buf.len() > skip_data_read_len { + && let Some(fallback_commands) = bmc_vendor + .fallback_serial_activate_commands_if_needed( + &prompt_buf, + fallback_activate_sent, + ) + { + tracing::info!( + %machine_id, + "Primary SOL activation failed, trying fallback" + ); + fallback_activate_sent = true; + fallback_activate_commands = Some(fallback_commands); + next_fallback_command_index = 0; + let fallback_command = fallback_commands[next_fallback_command_index]; + next_fallback_command_index += 1; + skip_data_read_len = bmc_prompt.len() + fallback_command.len(); + timeout = tokio::time::Instant::now() + std::time::Duration::from_secs(30); + send_command_bytewise( + ssh_client_channel, + fallback_command, + "sending fallback serial activate command to BMC", + ) + .await?; + prompt_buf.clear(); + } + + if matches!(activation_step, SerialConsoleActivationStep::ActivateSent) + && let Some(fallback_commands) = fallback_activate_commands + && next_fallback_command_index < fallback_commands.len() + && prompt_buf.len() > skip_data_read_len + && prompt_buf.windows(bmc_prompt.len()).any(|window| window == bmc_prompt) + { + let fallback_command = fallback_commands[next_fallback_command_index]; + next_fallback_command_index += 1; + skip_data_read_len = bmc_prompt.len() + fallback_command.len(); + timeout = tokio::time::Instant::now() + std::time::Duration::from_secs(30); + send_command_bytewise( + ssh_client_channel, + fallback_command, + "sending fallback serial activate command to BMC", + ) + .await?; + prompt_buf.clear(); + } + + let waiting_for_fallback_prompt = fallback_activate_commands + .is_some_and(|commands| next_fallback_command_index < commands.len()); + let fallback_sequence_complete = fallback_activate_commands + .is_some_and(|commands| next_fallback_command_index == commands.len()); + let activation_output = if fallback_sequence_complete + && let Some(fallback_commands) = fallback_activate_commands + { + let final_fallback_command = fallback_commands[fallback_commands.len() - 1]; + prompt_buf + .windows(final_fallback_command.len()) + .rposition(|window| window == final_fallback_command) + .map(|command_offset| &prompt_buf[command_offset..]) + } else { + Some(prompt_buf.as_slice()) + }; + if matches!(activation_step, SerialConsoleActivationStep::ActivateSent) + && !waiting_for_fallback_prompt + && let Some(activation_output) = activation_output + && !(fallback_sequence_complete + && activation_output + .windows(bmc_prompt.len()) + .any(|window| window == bmc_prompt)) + && bmc_vendor.should_accept_sol_activation_output( + activation_output, + skip_data_read_len, + ) { tracing::debug!(%machine_id, "confirmed serial activate command sent, letting client use console"); break; } @@ -537,6 +608,27 @@ enum SerialConsoleActivationStep { ActivateSent, } +async fn send_command_bytewise( + ssh_client_channel: &mut Channel, + command: &[u8], + phase: &'static str, +) -> Result<(), ConsoleActivateError> { + for byte in command { + ssh_client_channel + .data([*byte].as_slice()) + .await + .map_err(|error| ConsoleActivateError::Request { phase, error })?; + } + ssh_client_channel + .data(b"\n".as_slice()) + .await + .map_err(|error| ConsoleActivateError::Request { + phase: "sending data to BMC", + error, + })?; + Ok(()) +} + /// Returns `true` if `buf` contains the byte sequence `pat` anywhere /// (contiguously), running in O(n*m) time (n = buf.len(), m = pat.len()) /// and doing no heap allocations. diff --git a/crates/ssh-console/src/bmc/vendor.rs b/crates/ssh-console/src/bmc/vendor.rs index 791c63d117..df0927f89a 100644 --- a/crates/ssh-console/src/bmc/vendor.rs +++ b/crates/ssh-console/src/bmc/vendor.rs @@ -24,6 +24,8 @@ use serde::{Deserialize, Deserializer, Serialize}; /// The escape sequence for IPMI is vendor-independent since it's specific to ipmitool. pub static IPMITOOL_ESCAPE_SEQUENCE: EscapeSequence = EscapeSequence::Pair((b'~', &[b'.', b'B', b'?', 0x1a, 0x18])); +const LENOVO_SOL_PRIMARY_FAILURE: &[u8] = b"The command line contains extraneous arguments"; +const LENOVO_SOL_FALLBACK_ACTIVATE_COMMANDS: &[&[u8]] = &[b"console kill", b"console start"]; #[derive(Copy, Clone, Debug, PartialEq)] pub enum BmcVendor { @@ -171,6 +173,35 @@ impl SshBmcVendor { } } + pub fn fallback_serial_activate_commands_if_needed( + &self, + prompt_buf: &[u8], + fallback_sent: bool, + ) -> Option<&'static [&'static [u8]]> { + match self { + SshBmcVendor::Lenovo + if !fallback_sent + && bytes_contains(prompt_buf, LENOVO_SOL_PRIMARY_FAILURE) + && self + .bmc_prompt() + .is_some_and(|prompt| bytes_contains(prompt_buf, prompt)) => + { + Some(LENOVO_SOL_FALLBACK_ACTIVATE_COMMANDS) + } + _ => None, + } + } + + pub fn should_accept_sol_activation_output( + &self, + prompt_buf: &[u8], + skip_data_read_len: usize, + ) -> bool { + let lenovo_failure_pending = matches!(self, SshBmcVendor::Lenovo) + && bytes_contains(prompt_buf, LENOVO_SOL_PRIMARY_FAILURE); + !lenovo_failure_pending && prompt_buf.len() > skip_data_read_len + } + pub fn filter_escape_sequences<'a>( &self, input: &'a [u8], @@ -202,6 +233,10 @@ impl SshBmcVendor { } } +fn bytes_contains(buf: &[u8], pat: &[u8]) -> bool { + !pat.is_empty() && buf.windows(pat.len()).any(|window| window == pat) +} + #[derive(Clone, Copy, PartialEq)] pub enum EscapeSequence { // A single one-byte escape (ie. ctrl+\) @@ -332,6 +367,18 @@ mod tests { prev_pending: bool, } + struct FallbackCase { + vendor: SshBmcVendor, + output: &'static [u8], + fallback_sent: bool, + } + + struct AcceptActivationCase { + vendor: SshBmcVendor, + output: &'static [u8], + skip_data_read_len: usize, + } + /// The Lenovo/HPE two-byte escape (`ESC (`), used by most filtering rows. const ESC_PAREN: EscapeSequence = EscapeSequence::Pair((0x1b, &[0x28])); @@ -495,6 +542,75 @@ mod tests { } } + #[test] + fn fallback_serial_activate_commands_if_needed_detects_lenovo_failure() { + let lenovo_primary_failure = + b"console kill 1\r\nThe command line contains extraneous arguments\r\nsystem>"; + + value_scenarios!( + run = |case: FallbackCase| case.vendor + .fallback_serial_activate_commands_if_needed(case.output, case.fallback_sent) + .is_some(); + + "Lenovo SR650 v4 fallback" { + FallbackCase { vendor: SshBmcVendor::Lenovo, output: lenovo_primary_failure, fallback_sent: false } => true, + FallbackCase { vendor: SshBmcVendor::Lenovo, output: b"console kill 1\r\nThe command line contains extraneous arguments\r\n", fallback_sent: false } => false, + FallbackCase { vendor: SshBmcVendor::Lenovo, output: lenovo_primary_failure, fallback_sent: true } => false, + FallbackCase { vendor: SshBmcVendor::Dell, output: lenovo_primary_failure, fallback_sent: false } => false, + } + ); + + let commands = SshBmcVendor::Lenovo + .fallback_serial_activate_commands_if_needed(lenovo_primary_failure, false) + .expect("Lenovo failure should provide fallback commands"); + assert_eq!( + commands, + &[b"console kill".as_slice(), b"console start".as_slice()] + ); + } + + #[test] + fn should_accept_sol_activation_output_handles_fallback_cases() { + let bmc_prompt = SshBmcVendor::Lenovo.bmc_prompt().unwrap(); + let lenovo_primary_skip_len = bmc_prompt.len() + + SshBmcVendor::Lenovo + .serial_activate_command() + .unwrap() + .len(); + let lenovo_start_skip_len = bmc_prompt.len() + b"console start".len(); + + value_scenarios!( + run = |case: AcceptActivationCase| case.vendor.should_accept_sol_activation_output( + case.output, + case.skip_data_read_len, + ); + + "Lenovo primary failure waits for fallback" { + AcceptActivationCase { + vendor: SshBmcVendor::Lenovo, + output: b"console kill 1\r\nThe command line contains extraneous arguments\r\n", + skip_data_read_len: lenovo_primary_skip_len, + } => false, + } + + "Lenovo fallback start succeeds by byte count" { + AcceptActivationCase { + vendor: SshBmcVendor::Lenovo, + output: b"console start\r\nroot@host # ", + skip_data_read_len: lenovo_start_skip_len, + } => true, + } + + "non-Lenovo activation still succeeds by byte count" { + AcceptActivationCase { + vendor: SshBmcVendor::Dell, + output: b"connect com2\r\nready", + skip_data_read_len: b"connect com2".len(), + } => true, + } + ); + } + #[test] fn bmc_vendor_deserialize_rejects_an_unknown_string() { scenarios!( diff --git a/crates/ssh-console/tests/main.rs b/crates/ssh-console/tests/main.rs index ae231623ec..2c1ee0857d 100644 --- a/crates/ssh-console/tests/main.rs +++ b/crates/ssh-console/tests/main.rs @@ -160,6 +160,32 @@ async fn test_ssh_console() -> eyre::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread")] +async fn test_ssh_console_lenovo_sr650_sol_fallback() -> eyre::Result<()> { + if std::env::var("REPO_ROOT").is_err() { + tracing::info!("Skipping running ssh-console integration tests, as REPO_ROOT is not set"); + return Ok(()); + } + let Some(env) = run_baseline_test_environment(vec![MockBmcType::LenovoSr650Ssh]).await? else { + return Ok(()); + }; + + let handle = ssh_console_test_helper::spawn(env.mock_api_server.addr.port(), None).await?; + + env.run_baseline_assertions( + handle.addr, + "new-ssh-console Lenovo SR650", + &[BaselineTestAssertion::ConnectAsMachineId], + || None, + false, + ) + .await?; + + handle.spawn_handle.shutdown_and_wait().await; + + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn test_ssh_console_reconnect() -> eyre::Result<()> { if std::env::var("REPO_ROOT").is_err() { diff --git a/crates/ssh-console/tests/util/mod.rs b/crates/ssh-console/tests/util/mod.rs index 22b9c2db82..182982ff0a 100644 --- a/crates/ssh-console/tests/util/mod.rs +++ b/crates/ssh-console/tests/util/mod.rs @@ -112,21 +112,25 @@ pub fn log_stdout_and_stderr(process: &mut tokio::process::Child, prefix: &str) pub async fn run_baseline_test_environment( machines: Vec, ) -> eyre::Result> { - let mock_bmc_handles: Vec<(MockBmcHandle, MachineId)> = + let mock_bmc_handles: Vec<(MockBmcHandle, MachineId, MockBmcType)> = join_all(machines.iter().map(|bmc_type| { // Generate random machine ID's for each mocked host let machine_id = carbide_uuid::machine::MachineId::new( MachineIdSource::Tpm, rand::random(), match bmc_type { - MockBmcType::Ssh | MockBmcType::Ipmi => MachineType::Host, + MockBmcType::Ssh | MockBmcType::LenovoSr650Ssh | MockBmcType::Ipmi => { + MachineType::Host + } MockBmcType::DpuSsh => MachineType::Dpu, }, ); async move { let bmc_handle = match bmc_type { - ssh_type @ MockBmcType::Ssh | ssh_type @ MockBmcType::DpuSsh => { + ssh_type @ MockBmcType::Ssh + | ssh_type @ MockBmcType::LenovoSr650Ssh + | ssh_type @ MockBmcType::DpuSsh => { Ok::(MockBmcHandle::Ssh( machine_a_tron::spawn_mock_ssh_server( IpAddr::from_str("127.0.0.1").unwrap(), @@ -138,6 +142,7 @@ pub async fn run_baseline_test_environment( }), match ssh_type { MockBmcType::Ssh => PromptBehavior::Dell, + MockBmcType::LenovoSr650Ssh => PromptBehavior::LenovoSr650, MockBmcType::DpuSsh => PromptBehavior::Dpu, MockBmcType::Ipmi => unreachable!(), }, @@ -150,7 +155,7 @@ pub async fn run_baseline_test_environment( )), }?; - Ok::<_, eyre::Error>((bmc_handle, machine_id)) + Ok::<_, eyre::Error>((bmc_handle, machine_id, *bmc_type)) } })) .await @@ -161,12 +166,15 @@ pub async fn run_baseline_test_environment( let mock_hosts: Arc> = Arc::new( mock_bmc_handles .iter() - .map(|(bmc_handle, machine_id)| MockHost { + .map(|(bmc_handle, machine_id, bmc_type)| MockHost { machine_id: *machine_id, instance_id: Uuid::new_v4(), tenant_public_key: TENANT_SSH_PUBKEY.to_string(), sys_vendor: match &bmc_handle { - MockBmcHandle::Ssh(_) => "Dell", + MockBmcHandle::Ssh(_) => match bmc_type { + MockBmcType::LenovoSr650Ssh => "Lenovo", + _ => "Dell", + }, MockBmcHandle::Ipmi(_) => "Supermicro", }, bmc_ip: IpAddr::V4(Ipv4Addr::LOCALHOST), @@ -197,7 +205,7 @@ pub async fn run_baseline_test_environment( mock_api_server: api_server_handle, _mock_bmc_handles: mock_bmc_handles .into_iter() - .map(|(handle, _machine_id)| handle) + .map(|(handle, _machine_id, _bmc_type)| handle) .collect(), mock_hosts, })) @@ -206,6 +214,7 @@ pub async fn run_baseline_test_environment( #[derive(Debug, Clone, Copy)] pub enum MockBmcType { Ssh, + LenovoSr650Ssh, DpuSsh, Ipmi, }