Skip to content

Commit 0038806

Browse files
dulinriley and facebook-github-bot
authored and committed
Ignore mailbox errors in GetState for HostMeshAgent and ProcMeshAgent (#1691)
Summary: If some actor owns some ActorMeshes, it will periodically send out a GetState message to the HostMeshAgent and ProcMeshAgent. If that sender crashes while waiting for a reply, it will cause a MailboxSenderError on the agents. We don't want those agents to stop because of such an error, as it just means nobody will receive the reply. GetRankStatus and GetState messages are read-only and have no side effects, so it's fine to just warn on the MailboxSenderError; no invalid state is left hanging around.
Reviewed By: vidhyav, xunnanxu
Differential Revision: D85720817
1 parent 17de318 commit 0038806

File tree

2 files changed

+48
-5
lines changed

2 files changed

+48
-5
lines changed

hyperactor_mesh/src/proc_mesh/mesh_agent.rs

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -669,8 +669,18 @@ impl Handler<resource::GetRankStatus> for ProcMeshAgent {
669669
StatusOverlay::try_from_runs(vec![(rank..(rank + 1), status)])
670670
.expect("valid single-run overlay")
671671
};
672-
get_rank_status.reply.send(cx, overlay)?;
673-
672+
let result = get_rank_status.reply.send(cx, overlay);
673+
// Ignore errors, because returning Err from here would cause the ProcMeshAgent
674+
// to be stopped, which would prevent querying and spawning other actors.
675+
// This only means some actor that requested the rank status failed to receive it.
676+
if let Err(e) = result {
677+
tracing::warn!(
678+
actor = %cx.self_id(),
679+
"failed to send GetRankStatus reply to {} due to error: {}",
680+
get_rank_status.reply.port_id().actor_id(),
681+
e
682+
);
683+
}
674684
Ok(())
675685
}
676686
}
@@ -724,7 +734,18 @@ impl Handler<resource::GetState<ActorState>> for ProcMeshAgent {
724734
},
725735
};
726736

727-
get_state.reply.send(cx, state)?;
737+
let result = get_state.reply.send(cx, state);
738+
// Ignore errors, because returning Err from here would cause the ProcMeshAgent
739+
// to be stopped, which would prevent querying and spawning other actors.
740+
// This only means some actor that requested the state of an actor failed to receive it.
741+
if let Err(e) = result {
742+
tracing::warn!(
743+
actor = %cx.self_id(),
744+
"failed to send GetState reply to {} due to error: {}",
745+
get_state.reply.port_id().actor_id(),
746+
e
747+
);
748+
}
728749
Ok(())
729750
}
730751
}

hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,18 @@ impl Handler<resource::GetRankStatus> for HostMeshAgent {
290290
StatusOverlay::try_from_runs(vec![(rank..(rank + 1), status)])
291291
.expect("valid single-run overlay")
292292
};
293-
get_rank_status.reply.send(cx, overlay)?;
293+
let result = get_rank_status.reply.send(cx, overlay);
294+
// Ignore errors, because returning Err from here would cause the HostMeshAgent
295+
// to be stopped, which would take down the entire host. This only means
296+
// some actor that requested the rank status failed to receive it.
297+
if let Err(e) = result {
298+
tracing::warn!(
299+
actor = %cx.self_id(),
300+
"failed to send GetRankStatus reply to {} due to error: {}",
301+
get_rank_status.reply.port_id().actor_id(),
302+
e
303+
);
304+
}
294305
Ok(())
295306
}
296307
}
@@ -403,7 +414,18 @@ impl Handler<resource::GetState<ProcState>> for HostMeshAgent {
403414
},
404415
};
405416

406-
get_state.reply.send(cx, state)?;
417+
let result = get_state.reply.send(cx, state);
418+
// Ignore errors, because returning Err from here would cause the HostMeshAgent
419+
// to be stopped, which would take down the entire host. This only means
420+
// some actor that requested the state of a proc failed to receive it.
421+
if let Err(e) = result {
422+
tracing::warn!(
423+
actor = %cx.self_id(),
424+
"failed to send GetState reply to {} due to error: {}",
425+
get_state.reply.port_id().actor_id(),
426+
e
427+
);
428+
}
407429
Ok(())
408430
}
409431
}

0 commit comments

Comments
 (0)