Skip to content

Commit 3626d5c

Browse files
authored
[RayCluster] Add more context why we don't recreate head Pod for RayJob (#4175)
Signed-off-by: Kai-Hsun Chen <[email protected]>
1 parent 9eb8e3a commit 3626d5c

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -621,6 +621,17 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
621621
} else if len(headPods.Items) == 0 {
622622
originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey])
623623
if originatedFrom == utils.RayJobCRD {
624+
// If a RayCluster created by a RayJob has already been provisioned, recreating its head Pod doesn't help the RayJob.
625+
//
626+
// Case 1: GCS fault tolerance is disabled
627+
//
628+
// In this case, the worker Pods will be killed by the new head Pod when it is created, so the new Ray job will not be running in
629+
// a "provisioned" cluster.
630+
//
631+
// Case 2: GCS fault tolerance is enabled
632+
//
633+
// In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been
634+
// used by the old Ray job, so the new Ray job will fail.
624635
if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) {
625636
logger.Info(
626637
"reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",

0 commit comments

Comments
 (0)