diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index 9ac225b7cb..e445304219 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -1864,10 +1864,7 @@ impl MachineStateHandler { ); } ManagedHostState::Failed { .. } - if is_unassigned_dpu_reprovision_host_boot_failure( - managed_state, - host_machine_id, - ) => + if is_reprovision_restartable_failure(managed_state, host_machine_id) => { set_managed_host_topology_update_needed( ctx.pending_db_writes, @@ -1875,7 +1872,7 @@ impl MachineStateHandler { &dpus_for_reprov, ); - // Host boot repair failures leave the host in top-level Failed; restart must + // These failures leave the host in top-level Failed; restart must // reconstruct the reprovision map instead of trying to advance from Failed. next_state = Some( ReprovisionState::next_substate_based_on_bfb_support( @@ -1912,10 +1909,11 @@ impl MachineStateHandler { } } -fn is_unassigned_dpu_reprovision_host_boot_failure( +fn is_reprovision_restartable_failure( managed_state: &ManagedHostState, host_machine_id: &MachineId, ) -> bool { + // BiosSetupFailed is always attributed to the host itself. matches!( managed_state, ManagedHostState::Failed { @@ -1928,6 +1926,18 @@ fn is_unassigned_dpu_reprovision_host_boot_failure( }, .. } if machine_id == host_machine_id + ) || + // DpfProvisioning may be attributed to a specific DPU (e.g. DPU entered + // error phase), so we do not guard on machine_id or source here. + matches!( + managed_state, + ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::DpfProvisioning { .. }, + .. + }, + .. + } ) } @@ -11703,4 +11713,59 @@ mod tests { let cycle = get_reboot_cycle(expected_time, state_change_time, wait_period).unwrap(); assert_eq!(cycle, 30); } + + #[test] + fn is_reprovision_restartable_failure_matches_expected_causes() { + let host_id = + MachineId::from_str("fm100htes3rn1npvbtm5qd57dkilaag7ljugl1llmm7rfuq1ov50i0rpl30") + .unwrap(); + let dpu_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let make_failed = |cause: FailureCause, machine_id: MachineId| ManagedHostState::Failed { + details: FailureDetails { + cause, + failed_at: chrono::Utc::now(), + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + }, + machine_id, + retry_count: 0, + }; + + // BiosSetupFailed on host → restartable + assert!(is_reprovision_restartable_failure( + &make_failed( + FailureCause::BiosSetupFailed { err: "bios".into() }, + host_id + ), + &host_id, + )); + + // DpfProvisioning on host → restartable + assert!(is_reprovision_restartable_failure( + &make_failed(FailureCause::DpfProvisioning { err: "dpf".into() }, host_id), + &host_id, + )); + + // DpfProvisioning attributed to a DPU → still restartable (no machine_id guard) + assert!(is_reprovision_restartable_failure( + &make_failed(FailureCause::DpfProvisioning { err: "dpf".into() }, dpu_id), + &host_id, + )); + + // DpfProvisioning on host with any FailureSource → restartable + assert!(is_reprovision_restartable_failure( + &ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::DpfProvisioning { err: "dpf".into() }, + failed_at: chrono::Utc::now(), + source: FailureSource::StateMachineArea(StateMachineArea::HostInit), + }, + machine_id: host_id, + retry_count: 0, + }, + &host_id, + )); + } } diff --git a/crates/machine-controller/src/handler/dpf.rs b/crates/machine-controller/src/handler/dpf.rs index d513462ac5..b88a43b40b 100644 --- a/crates/machine-controller/src/handler/dpf.rs +++ b/crates/machine-controller/src/handler/dpf.rs @@ -227,23 +227,45 @@ async fn create_and_register_dpudevices_and_dpunode( Ok(()) } -fn dpf_cr_creation_failed( +/// Build the correct failure state depending on whether the host is currently +/// `Assigned` (DPU reprovision path). When `Assigned`, we preserve the outer +/// state and embed the failure as `InstanceState::Failed`; otherwise we use +/// the top-level `ManagedHostState::Failed`. +fn make_failure_state( + state: &ManagedHostStateSnapshot, + details: FailureDetails, machine_id: MachineId, +) -> ManagedHostState { + if matches!(state.managed_state, ManagedHostState::Assigned { .. }) { + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details, + machine_id, + }, + } + } else { + ManagedHostState::Failed { + details, + machine_id, + retry_count: 0, + } + } +} + +fn dpf_cr_creation_failed( + state: &ManagedHostStateSnapshot, err: &StateHandlerError, ) -> StateHandlerOutcome { - StateHandlerOutcome::transition(ManagedHostState::Failed { - details: FailureDetails { - cause: FailureCause::DpfProvisioning { - err: format!( - "DPUDevice/DPUNode creation failed. Force-delete again to clean old values. Wait until DPU CR are deleted. {err}" - ), - }, - failed_at: chrono::Utc::now(), - source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + let details = FailureDetails { + cause: FailureCause::DpfProvisioning { + err: format!( + "DPUDevice/DPUNode creation failed. Force-delete/restart reprovisioning (reprovisioning case) to clean old values. Wait until DPU CR are deleted. {err}" + ), }, - machine_id, - retry_count: 0, - }) + failed_at: chrono::Utc::now(), + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + }; + StateHandlerOutcome::transition(make_failure_state(state, details, state.host_snapshot.id)) } /// Handle DpfState::Provisioning: register all DPU devices and the node, then @@ -253,7 +275,7 @@ async fn handle_dpf_provisioning( dpf_sdk: &dyn DpfOperations, ) -> Result, StateHandlerError> { if let Err(err) = create_and_register_dpudevices_and_dpunode(state, dpf_sdk).await { - return Ok(dpf_cr_creation_failed(state.host_snapshot.id, &err)); + return Ok(dpf_cr_creation_failed(state, &err)); } let next = @@ -349,20 +371,21 @@ async fn handle_dpf_waiting_for_ready( dpu = %dpu_snapshot.id, "DPU entered error phase during DPF provisioning" ); - return Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { - details: FailureDetails { - cause: FailureCause::DpfProvisioning { - err: format!( - "DPU {} entered error phase during DPF provisioning", - dpu_snapshot.id - ), - }, - failed_at: chrono::Utc::now(), - source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + let details = FailureDetails { + cause: FailureCause::DpfProvisioning { + err: format!( + "DPU {} entered error phase during DPF provisioning", + dpu_snapshot.id + ), }, - machine_id: dpu_snapshot.id, - retry_count: 0, - })); + failed_at: chrono::Utc::now(), + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + }; + return Ok(StateHandlerOutcome::transition(make_failure_state( + state, + details, + dpu_snapshot.id, + ))); } // wait for dpf to report that the dpu is ready if current_phase != carbide_dpf::DpuPhase::Ready { @@ -415,7 +438,7 @@ async fn handle_dpf_reprovisioning( "DPUDevice/DPUNode CRs do not exist, creating them before reprovisioning" ); if let Err(err) = create_and_register_dpudevices_and_dpunode(state, dpf_sdk).await { - return Ok(dpf_cr_creation_failed(state.host_snapshot.id, &err)); + return Ok(dpf_cr_creation_failed(state, &err)); } let next = transition_all_dpus_to_dpf_state( DpfState::WaitingForReady { phase_detail: None }, @@ -466,20 +489,21 @@ pub async fn handle_dpf_state( node = %node_name, "DPUNode has stale labels, failing for reprovisioning" ); - return Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { - details: FailureDetails { - cause: FailureCause::DpfProvisioning { - err: format!( - "DPUNode {node_name} has stale labels; \ - must be deleted and reprovisioned" - ), - }, - failed_at: chrono::Utc::now(), - source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + let details = FailureDetails { + cause: FailureCause::DpfProvisioning { + err: format!( + "DPUNode {node_name} has stale labels; \ + must be deleted and reprovisioned" + ), }, - machine_id: state.host_snapshot.id, - retry_count: 0, - })); + failed_at: chrono::Utc::now(), + source: FailureSource::StateMachineArea(StateMachineArea::MainFlow), + }; + return Ok(StateHandlerOutcome::transition(make_failure_state( + state, + details, + state.host_snapshot.id, + ))); } match dpf_state {