Skip to content

Commit f4c9d27

Browse files
committed
Add K8sJobMode check to prevent YuniKorn from adding submitter pod annotations
Signed-off-by: win5923 <[email protected]>
1 parent a33a3b7 commit f4c9d27

File tree

2 files changed

+23
-10
lines changed

2 files changed

+23
-10
lines changed

ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,8 +80,8 @@ func (v *VolcanoBatchScheduler) handleRayJob(ctx context.Context, rayJob *rayv1.
8080
// submitter's resource requests into MinResources so capacity is reserved.
8181
if rayJob.Spec.SubmissionMode == rayv1.K8sJobMode {
8282
submitterTemplate := common.GetSubmitterTemplate(&rayJob.Spec, rayJob.Spec.RayClusterSpec)
83-
submitResource := utils.CalculatePodResource(submitterTemplate.Spec)
84-
totalResourceList = append(totalResourceList, submitResource)
83+
submitterResource := utils.CalculatePodResource(submitterTemplate.Spec)
84+
totalResourceList = append(totalResourceList, submitterResource)
8585
}
8686

8787
return v.syncPodGroup(ctx, rayJob, minMember, utils.SumResourceList(totalResourceList))

ray-operator/controllers/ray/rayjob_controller.go

Lines changed: 21 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -279,14 +279,23 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
279279
jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
280280
if err != nil {
281281
// If the Ray job was not found, GetJobInfo returns a BadRequest error.
282-
if rayJobInstance.Spec.SubmissionMode == rayv1.HTTPMode && errors.IsBadRequest(err) {
283-
logger.Info("The Ray job was not found. Submit a Ray job via an HTTP request.", "JobId", rayJobInstance.Status.JobId)
284-
if _, err := rayDashboardClient.SubmitJob(ctx, rayJobInstance); err != nil {
285-
logger.Error(err, "Failed to submit the Ray job", "JobId", rayJobInstance.Status.JobId)
286-
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
282+
if errors.IsBadRequest(err) {
283+
if rayJobInstance.Spec.SubmissionMode == rayv1.HTTPMode {
284+
logger.Info("The Ray job was not found. Submit a Ray job via an HTTP request.", "JobId", rayJobInstance.Status.JobId)
285+
if _, err := rayDashboardClient.SubmitJob(ctx, rayJobInstance); err != nil {
286+
logger.Error(err, "Failed to submit the Ray job", "JobId", rayJobInstance.Status.JobId)
287+
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
288+
}
289+
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
290+
}
291+
if isSubmitterFinished {
292+
rayJobInstance.Status.JobDeploymentStatus = rayv1.JobDeploymentStatusFailed
293+
rayJobInstance.Status.Reason = rayv1.AppFailed
294+
rayJobInstance.Status.Message = "Submitter completed but Ray job not found in RayCluster."
295+
break
287296
}
288-
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
289297
}
298+
290299
logger.Error(err, "Failed to get job info", "JobId", rayJobInstance.Status.JobId)
291300
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
292301
}
@@ -937,7 +946,7 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
937946
if err != nil {
938947
return nil, err
939948
}
940-
if r.options.BatchSchedulerManager != nil {
949+
if r.options.BatchSchedulerManager != nil && rayJobInstance.Spec.SubmissionMode == rayv1.K8sJobMode {
941950
if scheduler, err := r.options.BatchSchedulerManager.GetScheduler(); err == nil {
942951
// Group name is only used for individual pods to specify their task group ("headgroup", "worker-group-1", etc.).
943952
// RayCluster contains multiple groups, so we pass an empty string.
@@ -1050,7 +1059,11 @@ func (r *RayJobReconciler) checkSubmitterAndUpdateStatusIfNeeded(ctx context.Con
10501059
}
10511060

10521061
if headPod == nil {
1053-
logger.Info("Ray head pod not found, skipping sidecar container status check")
1062+
// If head pod is deleted, mark the RayJob as failed
1063+
shouldUpdate = true
1064+
rayJob.Status.JobDeploymentStatus = rayv1.JobDeploymentStatusFailed
1065+
rayJob.Status.Reason = rayv1.AppFailed
1066+
rayJob.Status.Message = "Ray head pod not found."
10541067
return
10551068
}
10561069

0 commit comments

Comments
 (0)