Skip to content

Commit c21938d

Browse files
[Bug] Sidecar mode shouldn't restart head pod when head pod is deleted (#4141)
* [Bug] Sidecar mode shouldn't restart head pod when head pod is deleted
  Signed-off-by: 400Ping <[email protected]>
* [Fix] Fix e2e error
  Signed-off-by: 400Ping <[email protected]>
* [Fix] fix according to rueian's comment
  Signed-off-by: 400Ping <[email protected]>
* [Chore] fix ci error
  Signed-off-by: 400Ping <[email protected]>
* Update ray-operator/controllers/ray/raycluster_controller.go
  Co-authored-by: Han-Ju Chen (Future-Outlier) <[email protected]>
  Signed-off-by: Ping <[email protected]>
* Update ray-operator/controllers/ray/rayjob_controller.go
  Co-authored-by: Han-Ju Chen (Future-Outlier) <[email protected]>
  Signed-off-by: Ping <[email protected]>
* update
  Signed-off-by: Future-Outlier <[email protected]>
* update
  Signed-off-by: Future-Outlier <[email protected]>
* Trigger CI
  Signed-off-by: Future-Outlier <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
Signed-off-by: Ping <[email protected]>
Signed-off-by: Future-Outlier <[email protected]>
Co-authored-by: Han-Ju Chen (Future-Outlier) <[email protected]>
1 parent 530318b commit c21938d

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -619,6 +619,16 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
619619
return errstd.New(reason)
620620
}
621621
} else if len(headPods.Items) == 0 {
622+
originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey])
623+
if originatedFrom == utils.RayJobCRD {
624+
if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) {
625+
logger.Info(
626+
"reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",
627+
"rayCluster", instance.Name,
628+
)
629+
return nil
630+
}
631+
}
622632
// Create head Pod if it does not exist.
623633
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
624634
if err := r.createHeadPod(ctx, *instance); err != nil {

ray-operator/test/e2erayjob/rayjob_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -310,10 +310,10 @@ env_vars:
310310
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
311311
Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed)))
312312
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
313-
Should(WithTransform(RayJobReason, Equal(rayv1.AppFailed)))
313+
Should(WithTransform(RayJobReason, Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded)))
314314
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
315315
Should(WithTransform(func(job *rayv1.RayJob) string { return job.Status.Message },
316-
Equal("Submitter completed but Ray job not found in RayCluster.")))
316+
MatchRegexp("The RayJob submitter finished at .* but the ray job did not reach terminal state within .*")))
317317

318318
// Cleanup
319319
err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{})

0 commit comments

Comments
 (0)