Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 71 additions & 6 deletions crates/machine-controller/src/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1864,18 +1864,15 @@ impl MachineStateHandler {
);
}
ManagedHostState::Failed { .. }
if is_unassigned_dpu_reprovision_host_boot_failure(
managed_state,
host_machine_id,
) =>
if is_reprovision_restartable_failure(managed_state, host_machine_id) =>
{
set_managed_host_topology_update_needed(
ctx.pending_db_writes,
&state.host_snapshot,
&dpus_for_reprov,
);

// Host boot repair failures leave the host in top-level Failed; restart must
// These failures leave the host in top-level Failed; restart must
// reconstruct the reprovision map instead of trying to advance from Failed.
next_state = Some(
ReprovisionState::next_substate_based_on_bfb_support(
Expand Down Expand Up @@ -1912,10 +1909,11 @@ impl MachineStateHandler {
}
}

fn is_unassigned_dpu_reprovision_host_boot_failure(
fn is_reprovision_restartable_failure(
managed_state: &ManagedHostState,
host_machine_id: &MachineId,
) -> bool {
// BiosSetupFailed is always attributed to the host itself.
matches!(
managed_state,
ManagedHostState::Failed {
Expand All @@ -1928,6 +1926,18 @@ fn is_unassigned_dpu_reprovision_host_boot_failure(
},
..
} if machine_id == host_machine_id
) ||
// DpfProvisioning may be attributed to a specific DPU (e.g. DPU entered
// error phase), so we do not guard on machine_id or source here.
matches!(
managed_state,
ManagedHostState::Failed {
details: FailureDetails {
cause: FailureCause::DpfProvisioning { .. },
..
},
..
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
)
}

Expand Down Expand Up @@ -11703,4 +11713,59 @@ mod tests {
let cycle = get_reboot_cycle(expected_time, state_change_time, wait_period).unwrap();
assert_eq!(cycle, 30);
}

#[test]
fn is_reprovision_restartable_failure_matches_expected_causes() {
let host_id =
MachineId::from_str("fm100htes3rn1npvbtm5qd57dkilaag7ljugl1llmm7rfuq1ov50i0rpl30")
.unwrap();
let dpu_id =
MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng")
.unwrap();

let make_failed = |cause: FailureCause, machine_id: MachineId| ManagedHostState::Failed {
details: FailureDetails {
cause,
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
},
machine_id,
retry_count: 0,
};

// BiosSetupFailed on host → restartable
assert!(is_reprovision_restartable_failure(
&make_failed(
FailureCause::BiosSetupFailed { err: "bios".into() },
host_id
),
&host_id,
));

// DpfProvisioning on host → restartable
assert!(is_reprovision_restartable_failure(
&make_failed(FailureCause::DpfProvisioning { err: "dpf".into() }, host_id),
&host_id,
));

// DpfProvisioning attributed to a DPU → still restartable (no machine_id guard)
assert!(is_reprovision_restartable_failure(
&make_failed(FailureCause::DpfProvisioning { err: "dpf".into() }, dpu_id),
&host_id,
));

// DpfProvisioning on host with any FailureSource → restartable
assert!(is_reprovision_restartable_failure(
&ManagedHostState::Failed {
details: FailureDetails {
cause: FailureCause::DpfProvisioning { err: "dpf".into() },
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::HostInit),
},
machine_id: host_id,
retry_count: 0,
},
&host_id,
));
}
}
106 changes: 65 additions & 41 deletions crates/machine-controller/src/handler/dpf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,23 +227,45 @@ async fn create_and_register_dpudevices_and_dpunode(
Ok(())
}

fn dpf_cr_creation_failed(
/// Build the correct failure state depending on whether the host is currently
/// `Assigned` (DPU reprovision path). When `Assigned`, we preserve the outer
/// state and embed the failure as `InstanceState::Failed`; otherwise we use
/// the top-level `ManagedHostState::Failed`.
fn make_failure_state(
state: &ManagedHostStateSnapshot,
details: FailureDetails,
machine_id: MachineId,
) -> ManagedHostState {
if matches!(state.managed_state, ManagedHostState::Assigned { .. }) {
Comment thread
wminckler marked this conversation as resolved.
ManagedHostState::Assigned {
instance_state: InstanceState::Failed {
details,
machine_id,
},
}
} else {
ManagedHostState::Failed {
details,
machine_id,
retry_count: 0,
}
}
}

fn dpf_cr_creation_failed(
state: &ManagedHostStateSnapshot,
err: &StateHandlerError,
) -> StateHandlerOutcome<ManagedHostState> {
StateHandlerOutcome::transition(ManagedHostState::Failed {
details: FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPUDevice/DPUNode creation failed. Force-delete again to clean old values. Wait until DPU CR are deleted. {err}"
),
},
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
let details = FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPUDevice/DPUNode creation failed. Force-delete/restart reprovisioning (reprovisioning case) to clean old values. Wait until DPU CR are deleted. {err}"
),
},
machine_id,
retry_count: 0,
})
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
};
StateHandlerOutcome::transition(make_failure_state(state, details, state.host_snapshot.id))
}

/// Handle DpfState::Provisioning: register all DPU devices and the node, then
Expand All @@ -253,7 +275,7 @@ async fn handle_dpf_provisioning(
dpf_sdk: &dyn DpfOperations,
) -> Result<StateHandlerOutcome<ManagedHostState>, StateHandlerError> {
if let Err(err) = create_and_register_dpudevices_and_dpunode(state, dpf_sdk).await {
return Ok(dpf_cr_creation_failed(state.host_snapshot.id, &err));
return Ok(dpf_cr_creation_failed(state, &err));
}

let next =
Expand Down Expand Up @@ -349,20 +371,21 @@ async fn handle_dpf_waiting_for_ready(
dpu = %dpu_snapshot.id,
"DPU entered error phase during DPF provisioning"
);
return Ok(StateHandlerOutcome::transition(ManagedHostState::Failed {
details: FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPU {} entered error phase during DPF provisioning",
dpu_snapshot.id
),
},
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
let details = FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPU {} entered error phase during DPF provisioning",
dpu_snapshot.id
),
},
machine_id: dpu_snapshot.id,
retry_count: 0,
}));
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
};
return Ok(StateHandlerOutcome::transition(make_failure_state(
state,
details,
dpu_snapshot.id,
)));
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
// wait for dpf to report that the dpu is ready
if current_phase != carbide_dpf::DpuPhase::Ready {
Expand Down Expand Up @@ -415,7 +438,7 @@ async fn handle_dpf_reprovisioning(
"DPUDevice/DPUNode CRs do not exist, creating them before reprovisioning"
);
if let Err(err) = create_and_register_dpudevices_and_dpunode(state, dpf_sdk).await {
return Ok(dpf_cr_creation_failed(state.host_snapshot.id, &err));
return Ok(dpf_cr_creation_failed(state, &err));
}
let next = transition_all_dpus_to_dpf_state(
DpfState::WaitingForReady { phase_detail: None },
Expand Down Expand Up @@ -466,20 +489,21 @@ pub async fn handle_dpf_state(
node = %node_name,
"DPUNode has stale labels, failing for reprovisioning"
);
return Ok(StateHandlerOutcome::transition(ManagedHostState::Failed {
details: FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPUNode {node_name} has stale labels; \
must be deleted and reprovisioned"
),
},
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
let details = FailureDetails {
cause: FailureCause::DpfProvisioning {
err: format!(
"DPUNode {node_name} has stale labels; \
must be deleted and reprovisioned"
),
},
machine_id: state.host_snapshot.id,
retry_count: 0,
}));
failed_at: chrono::Utc::now(),
source: FailureSource::StateMachineArea(StateMachineArea::MainFlow),
};
return Ok(StateHandlerOutcome::transition(make_failure_state(
state,
details,
state.host_snapshot.id,
)));
}

match dpf_state {
Expand Down
Loading