@@ -279,14 +279,23 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
279
279
jobInfo , err := rayDashboardClient .GetJobInfo (ctx , rayJobInstance .Status .JobId )
280
280
if err != nil {
281
281
// If the Ray job was not found, GetJobInfo returns a BadRequest error.
282
- if rayJobInstance .Spec .SubmissionMode == rayv1 .HTTPMode && errors .IsBadRequest (err ) {
283
- logger .Info ("The Ray job was not found. Submit a Ray job via an HTTP request." , "JobId" , rayJobInstance .Status .JobId )
284
- if _ , err := rayDashboardClient .SubmitJob (ctx , rayJobInstance ); err != nil {
285
- logger .Error (err , "Failed to submit the Ray job" , "JobId" , rayJobInstance .Status .JobId )
286
- return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
282
+ if errors .IsBadRequest (err ) {
283
+ if rayJobInstance .Spec .SubmissionMode == rayv1 .HTTPMode {
284
+ logger .Info ("The Ray job was not found. Submit a Ray job via an HTTP request." , "JobId" , rayJobInstance .Status .JobId )
285
+ if _ , err := rayDashboardClient .SubmitJob (ctx , rayJobInstance ); err != nil {
286
+ logger .Error (err , "Failed to submit the Ray job" , "JobId" , rayJobInstance .Status .JobId )
287
+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
288
+ }
289
+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
290
+ }
291
+ if isSubmitterFinished {
292
+ rayJobInstance .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusFailed
293
+ rayJobInstance .Status .Reason = rayv1 .AppFailed
294
+ rayJobInstance .Status .Message = "Submitter completed but Ray job not found in RayCluster."
295
+ break
287
296
}
288
- return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
289
297
}
298
+
290
299
logger .Error (err , "Failed to get job info" , "JobId" , rayJobInstance .Status .JobId )
291
300
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
292
301
}
@@ -937,7 +946,7 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
937
946
if err != nil {
938
947
return nil , err
939
948
}
940
- if r .options .BatchSchedulerManager != nil {
949
+ if r .options .BatchSchedulerManager != nil && rayJobInstance . Spec . SubmissionMode == rayv1 . K8sJobMode {
941
950
if scheduler , err := r .options .BatchSchedulerManager .GetScheduler (); err == nil {
942
951
// Group name is only used for individual pods to specify their task group ("headgroup", "worker-group-1", etc.).
943
952
// RayCluster contains multiple groups, so we pass an empty string.
@@ -1050,7 +1059,11 @@ func (r *RayJobReconciler) checkSubmitterAndUpdateStatusIfNeeded(ctx context.Con
1050
1059
}
1051
1060
1052
1061
if headPod == nil {
1053
- logger .Info ("Ray head pod not found, skipping sidecar container status check" )
1062
+ // If the head pod is not found (e.g. it was deleted), mark the RayJob as failed.
1063
+ shouldUpdate = true
1064
+ rayJob .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusFailed
1065
+ rayJob .Status .Reason = rayv1 .AppFailed
1066
+ rayJob .Status .Message = "Ray head pod not found."
1054
1067
return
1055
1068
}
1056
1069
0 commit comments