Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 58 additions & 23 deletions crates/admin-cli/src/instance/reboot/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,72 @@
*/

use ::rpc::forge::{self as forgerpc, InstancePowerRequest};
use carbide_uuid::instance::InstanceId;
use eyre::WrapErr;

use super::args::Args;
use crate::errors::{CarbideCliError, CarbideCliResult};
use crate::rpc::ApiClient;

fn with_reboot_context<T>(
result: Result<T, tonic::Status>,
instance_id: InstanceId,
) -> CarbideCliResult<T> {
result
.wrap_err_with(|| format!("Failed to request reboot for instance {instance_id}"))
.map_err(CarbideCliError::from)
}

pub async fn handle_reboot(args: Args, api_client: &ApiClient) -> CarbideCliResult<()> {
let machine_id = api_client
.get_one_instance(args.instance)
.await?
.instances
.last()
.ok_or_else(|| CarbideCliError::GenericError("Unknown UUID".to_string()))?
.machine_id
.ok_or_else(|| {
CarbideCliError::GenericError("Instance has no machine associated.".to_string())
})?;

api_client
.0
.invoke_instance_power(InstancePowerRequest {
instance_id: Some(args.instance),
machine_id: Some(machine_id),
operation: forgerpc::instance_power_request::Operation::PowerReset as i32,
boot_with_custom_ipxe: args.custom_pxe,
apply_updates_on_reboot: args.apply_updates_on_reboot,
})
.await?;
with_reboot_context(
api_client
.0
.invoke_instance_power(InstancePowerRequest {
instance_id: Some(args.instance),
operation: forgerpc::instance_power_request::Operation::PowerReset as i32,
boot_with_custom_ipxe: args.custom_pxe,
apply_updates_on_reboot: args.apply_updates_on_reboot,
})
.await,
args.instance,
)?;
println!(
"Reboot for instance {} (machine {}) is requested successfully!",
args.instance, machine_id
"Reboot for instance {} is requested successfully!",
args.instance
);

Ok(())
}

#[cfg(test)]
mod tests {
use tonic::Code;

use super::*;

#[test]
fn reboot_error_includes_instance_context_and_preserves_source() {
let instance_id = "12345678-1234-5678-90ab-cdef01234567"
.parse::<InstanceId>()
.unwrap();

let error = with_reboot_context::<()>(
Err(tonic::Status::unavailable("API unavailable")),
instance_id,
)
.unwrap_err();

assert_eq!(
error.to_string(),
format!("Failed to request reboot for instance {instance_id}")
);
let CarbideCliError::EyreReport(report) = error else {
panic!("expected an EyreReport");
};
let status = report
.downcast_ref::<tonic::Status>()
.expect("tonic status should remain in the error chain");
assert_eq!(status.code(), Code::Unavailable);
assert_eq!(status.message(), "API unavailable");
}
}
64 changes: 14 additions & 50 deletions crates/api-core/src/handlers/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -835,56 +835,20 @@ pub(crate) async fn invoke_power(
let mut txn = api.txn_begin().await?;

let request = request.into_inner();

// Search by instance ID if provided, else by machine ID
let snapshot = if let Some(instance_id) = &request.instance_id {
let snapshot = db::managed_host::load_by_instance_ids(
&mut txn,
&[*instance_id],
LoadSnapshotOptions::default().with_host_health(api.runtime_config.host_health),
)
.await?
.pop()
.ok_or(CarbideError::NotFoundError {
kind: "instance",
id: instance_id.to_string(),
})?;

if let Some(machine_id) = &request.machine_id
&& *machine_id != snapshot.host_snapshot.id
{
return Err(CarbideError::InvalidArgument(format!(
"Instance {} is not hosted on machine {}",
instance_id, machine_id
))
.into());
}

snapshot
} else if let Some(machine_id) = &request.machine_id {
log_machine_id(machine_id);

let snapshot = db::managed_host::load_snapshot(
&mut txn,
machine_id,
LoadSnapshotOptions::default().with_host_health(api.runtime_config.host_health),
)
.await?
.ok_or(CarbideError::NotFoundError {
kind: "machine",
id: machine_id.to_string(),
})?;
if snapshot.instance.is_none() {
return Err(CarbideError::InvalidArgument(format!(
"Supplied machine ID does not match an instance: {machine_id}"
))
.into());
}

snapshot
} else {
return Err(CarbideError::MissingArgument("instance_id").into());
};
let instance_id = request
.instance_id
.ok_or(CarbideError::MissingArgument("instance_id"))?;
let snapshot = db::managed_host::load_by_instance_ids(
&mut txn,
&[instance_id],
LoadSnapshotOptions::default().with_host_health(api.runtime_config.host_health),
)
.await?
.pop()
.ok_or(CarbideError::NotFoundError {
kind: "instance",
id: instance_id.to_string(),
})?;
let machine_id = snapshot.host_snapshot.id;
log_machine_id(&machine_id);

Expand Down
5 changes: 0 additions & 5 deletions crates/api-core/src/tests/dpu_reprovisioning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,6 @@ async fn instance_reprov_start(
env.api
.invoke_instance_power(tonic::Request::new(::rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
apply_updates_on_reboot: true,
boot_with_custom_ipxe: false,
operation: 0,
Expand Down Expand Up @@ -965,7 +964,6 @@ async fn test_instance_reprov_without_firmware_upgrade(pool: sqlx::PgPool) {
env.api
.invoke_instance_power(tonic::Request::new(::rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
apply_updates_on_reboot: true,
boot_with_custom_ipxe: false,
operation: 0,
Expand Down Expand Up @@ -1012,7 +1010,6 @@ async fn test_instance_reprov_without_firmware_upgrade(pool: sqlx::PgPool) {
env.api
.invoke_instance_power(tonic::Request::new(::rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
apply_updates_on_reboot: true,
boot_with_custom_ipxe: false,
operation: 0,
Expand Down Expand Up @@ -1923,7 +1920,6 @@ async fn test_instance_reprov_restart_failed_impl(pool: sqlx::PgPool) {
env.api
.invoke_instance_power(tonic::Request::new(::rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
apply_updates_on_reboot: true,
boot_with_custom_ipxe: false,
operation: 0,
Expand Down Expand Up @@ -1974,7 +1970,6 @@ async fn test_instance_reprov_restart_failed_impl(pool: sqlx::PgPool) {
env.api
.invoke_instance_power(tonic::Request::new(::rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
apply_updates_on_reboot: true,
boot_with_custom_ipxe: false,
operation: 0,
Expand Down
1 change: 0 additions & 1 deletion crates/api-core/src/tests/host_bmc_firmware_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,6 @@ async fn test_instance_upgrading_actual(
// Simulate a tenant OKing the request
let request = rpc::forge::InstancePowerRequest {
instance_id: tinstance.id.into(),
machine_id: None,
operation: rpc::forge::instance_power_request::Operation::PowerReset.into(),
boot_with_custom_ipxe: false,
apply_updates_on_reboot: true,
Expand Down
30 changes: 9 additions & 21 deletions crates/api-core/src/tests/instance_ipxe_behaviors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,33 +124,22 @@ async fn test_instance_uses_custom_ipxe_only_once(pool: sqlx::PgPool) {
assert!(pxe.pxe_script.contains(
"This state assumes an OS is provisioned and will exit into the OS in 5 seconds."
));
}

// A reboot should also be possible with just MachineId
// TODO: Remove these assertions after the `machine_id` based reboots are removed.
env.api
.invoke_instance_power(tonic::Request::new(rpc::forge::InstancePowerRequest {
instance_id: None,
machine_id: Some(mh.id),
operation: rpc::forge::instance_power_request::Operation::PowerReset as _,
boot_with_custom_ipxe: false,
apply_updates_on_reboot: false,
}))
.await
.unwrap();
#[crate::sqlx_test]
async fn invoke_instance_power_requires_instance_id(pool: sqlx::PgPool) {
let env = create_test_env(pool).await;

// A request with mismatching Machine and InstanceId should fail
let err = env
.api
.invoke_instance_power(tonic::Request::new(rpc::forge::InstancePowerRequest {
instance_id: Some(tinstance.id),
machine_id: Some(mh.dpu_ids[0]),
operation: rpc::forge::instance_power_request::Operation::PowerReset as _,
boot_with_custom_ipxe: false,
apply_updates_on_reboot: false,
}))
.invoke_instance_power(tonic::Request::new(
rpc::forge::InstancePowerRequest::default(),
))
.await
.unwrap_err();

assert_eq!(err.code(), tonic::Code::InvalidArgument);
assert!(err.message().contains("instance_id"));
}

#[crate::sqlx_test]
Expand Down Expand Up @@ -201,7 +190,6 @@ async fn invoke_instance_power(
env.api
.invoke_instance_power(tonic::Request::new(rpc::forge::InstancePowerRequest {
instance_id: Some(instance_id),
machine_id: None,
operation: rpc::forge::instance_power_request::Operation::PowerReset as _,
boot_with_custom_ipxe,
apply_updates_on_reboot: false,
Expand Down
5 changes: 2 additions & 3 deletions crates/rpc/proto/forge.proto
Original file line number Diff line number Diff line change
Expand Up @@ -2543,9 +2543,8 @@ message MachineState {
}

message InstancePowerRequest {
// The Machine to perform power operation on.
// Deprecated: User `instance_id` field instead.
common.MachineId machine_id = 1;
reserved 1;
reserved "machine_id";

enum Operation {
POWER_RESET = 0;
Expand Down
13 changes: 9 additions & 4 deletions dev/bin/reprovision_dpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ echo "Found machine with host: $HOST_MACHINE_ID and DPU: $DPU_MACHINE_ID."
# Check Instance state
INSTANCE_ID=$(grpcurl -d '{}' -insecure "${API_SERVER}" forge.Forge/FindInstances | jq ".instances[0].id.value" | tr -d '"')

if [[ "$INSTANCE_ID" != "null" ]]; then
if [[ -n "$INSTANCE_ID" && "$INSTANCE_ID" != "null" ]]; then
INSTANCE_STATE=$(grpcurl -d "{\"id\": {\"value\": \"$INSTANCE_ID\"}}" -insecure "${API_SERVER}" forge.Forge/FindInstances | jq ".instances[0].status.tenant.state" | tr -d '"')
echo "Instance found with ID $INSTANCE_ID in state $INSTANCE_STATE"
fi
Expand All @@ -64,9 +64,14 @@ echo "Triggering DPU reprovision."
grpcurl -d "{\"dpu_id\": { \"id\": \"$DPU_MACHINE_ID\" }, \"mode\": 0, \"initiator\": 0, \"update_firmware\": true}" -insecure "${API_SERVER}" forge.Forge/TriggerDpuReprovisioning

# In case of Instance, reprovision will be triggered after user approval.
if [[ "$INSTANCE_ID" != "null" ]]; then
if [[ -n "$INSTANCE_ID" && "$INSTANCE_ID" != "null" ]]; then
echo "Sending reboot message with apply_updates_on_reboot true".
grpcurl -d "{\"operation\": 0, \"machine_id\": { \"id\": \"$HOST_MACHINE_ID\" }, \"apply_updates_on_reboot\": true}" -insecure "${API_SERVER}" forge.Forge/InvokeInstancePower
if ! grpcurl -d "{\"operation\": 0, \"instance_id\": { \"value\": \"$INSTANCE_ID\" }, \"apply_updates_on_reboot\": true}" -insecure "${API_SERVER}" forge.Forge/InvokeInstancePower; then
echo "Failed to send reboot approval for instance $INSTANCE_ID." >&2
exit 1
fi
else
echo "No instance found; skipping tenant reboot approval."
Comment thread
coderabbitai[bot] marked this conversation as resolved.
fi

#
Expand Down Expand Up @@ -138,7 +143,7 @@ export PREV_PATH=$PATH
export PATH=${REPO_ROOT}/dev/bin:$PATH
cargo run -p agent -- --config-path "$DPU_CONFIG_FILE" run --override-machine-id ${DPU_MACHINE_ID} &

if [[ "$INSTANCE_ID" == "null" ]]; then # No instance is configured
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "null" ]]; then # No instance is configured
# Next state is Discovered.
i=0
while [[ $i -lt $MAX_RETRY ]]; do
Expand Down
1 change: 1 addition & 0 deletions docs/observability/core_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo).
<tr><td>carbide_site_explorer_created_power_shelves_count</td><td>gauge</td><td>The amount of Power Shelves that had been created by Site Explorer after being identified</td></tr>
<tr><td>carbide_site_explorer_enabled</td><td>gauge</td><td>Whether site-explorer is enabled (1) or paused (0)</td></tr>
<tr><td>carbide_site_explorer_iteration_latency_milliseconds</td><td>histogram</td><td>The time it took to perform one site explorer iteration</td></tr>
<tr><td>carbide_site_explorer_last_run_status</td><td>gauge</td><td>The status of the latest Site Explorer run</td></tr>
<tr><td>carbide_site_explorer_phase_latency_milliseconds</td><td>histogram</td><td>The time it took to perform one site explorer iteration phase</td></tr>
<tr><td>carbide_site_explorer_update_explored_endpoints_count</td><td>gauge</td><td>Counts from the last update_explored_endpoints phase by kind</td></tr>
<tr><td>carbide_switches_enqueuer_iteration_latency_milliseconds</td><td>histogram</td><td>The overall time it took to enqueue state handling tasks for all carbide_switches in the system</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion rest-api/api/pkg/api/handler/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -1842,7 +1842,7 @@ func (uih UpdateInstanceHandler) handleReboot(c echo.Context, logger *zerolog.Lo

// Prepare the config update request workflow object
rebootInstanceRequest := &cwssaws.InstancePowerRequest{
MachineId: &cwssaws.MachineId{Id: *instance.MachineID},
InstanceId: &cwssaws.InstanceId{Value: instance.GetSiteID().String()},
Operation: cwssaws.InstancePowerRequest_POWER_RESET,
BootWithCustomIpxe: rebootWithCustomIpxe,
ApplyUpdatesOnReboot: applyUpdatesOnReboot,
Expand Down
Loading
Loading