@@ -15,6 +15,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
@@ -92,7 +93,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 		if isJobPendingOrRunning(rayJobInstance.Status.JobStatus) {
 			rayDashboardClient := utils.GetRayDashboardClientFunc()
 			rayDashboardClient.InitClient(rayJobInstance.Status.DashboardURL)
-			err := rayDashboardClient.StopJob(rayJobInstance.Status.JobId, &r.Log)
+			err := rayDashboardClient.StopJob(ctx, rayJobInstance.Status.JobId, &r.Log)
 			if err != nil {
 				r.Log.Info("Failed to stop job", "error", err)
 			}
@@ -150,6 +151,20 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 		err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1alpha1.JobDeploymentStatusFailedToGetOrCreateRayCluster, err)
 		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
 	}
+	// If there is no cluster instance and no error, suspend the job deployment.
+	if rayClusterInstance == nil {
+		// Already suspended?
+		if rayJobInstance.Status.JobDeploymentStatus == rayv1alpha1.JobDeploymentStatusSuspended {
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+		}
+		err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1alpha1.JobDeploymentStatusSuspended, err)
+		if err != nil {
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+		}
+		r.Log.Info("rayJob suspended", "RayJob", rayJobInstance.Name)
+		r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Suspended", "Suspended RayJob %s", rayJobInstance.Name)
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	}
 
 	// Always update RayClusterStatus along with jobStatus and jobDeploymentStatus updates.
 	rayJobInstance.Status.RayClusterStatus = rayClusterInstance.Status
@@ -178,7 +193,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 	}
 
 	// Check the current status of ray jobs before submitting.
-	jobInfo, err := rayDashboardClient.GetJobInfo(rayJobInstance.Status.JobId)
+	jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
 	if err != nil {
 		err = r.updateState(ctx, rayJobInstance, jobInfo, rayJobInstance.Status.JobStatus, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus, err)
 		// Dashboard service in head pod takes time to start, it's possible we get connection refused error.
@@ -189,7 +204,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 	r.Log.V(1).Info("RayJob information", "RayJob", rayJobInstance.Name, "jobInfo", jobInfo, "rayJobInstance", rayJobInstance.Status.JobStatus)
 	if jobInfo == nil {
 		// Submit the job if no id set
-		jobId, err := rayDashboardClient.SubmitJob(rayJobInstance, &r.Log)
+		jobId, err := rayDashboardClient.SubmitJob(ctx, rayJobInstance, &r.Log)
 		if err != nil {
 			r.Log.Error(err, "failed to submit job")
 			err = r.updateState(ctx, rayJobInstance, jobInfo, rayJobInstance.Status.JobStatus, rayv1alpha1.JobDeploymentStatusFailedJobDeploy, err)
@@ -213,9 +228,48 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 		return ctrl.Result{}, err
 	}
 
-	// Job may takes long time to start and finish, let's just periodically requeue the job and check status.
-	if isJobPendingOrRunning(jobInfo.JobStatus) && rayJobInstance.Status.JobDeploymentStatus == rayv1alpha1.JobDeploymentStatusRunning {
-		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+	if rayJobInstance.Status.JobDeploymentStatus == rayv1alpha1.JobDeploymentStatusRunning {
+		// If the suspend flag is set AND
+		// the RayJob is submitted against the RayCluster created by THIS job, then
+		// try to gracefully stop the Ray job and delete (suspend) the cluster.
+		if rayJobInstance.Spec.Suspend && len(rayJobInstance.Spec.ClusterSelector) == 0 {
+			info, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
+			if err != nil {
+				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+			}
+			if !rayv1alpha1.IsJobTerminal(info.JobStatus) {
+				err := rayDashboardClient.StopJob(ctx, rayJobInstance.Status.JobId, &r.Log)
+				if err != nil {
+					return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+				}
+			}
+			if info.JobStatus != rayv1alpha1.JobStatusStopped {
+				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+			}
+
+			_, err = r.deleteCluster(ctx, rayJobInstance)
+			if err != nil && !errors.IsNotFound(err) {
+				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+			}
+			// Since the RayCluster instance is gone, remove its status from the
+			// RayJob resource as well.
+			rayJobInstance.Status.RayClusterStatus = rayv1alpha1.RayClusterStatus{}
+			rayJobInstance.Status.RayClusterName = ""
+			rayJobInstance.Status.DashboardURL = ""
+			rayJobInstance.Status.JobId = ""
+			rayJobInstance.Status.Message = ""
+			err = r.updateState(ctx, rayJobInstance, jobInfo, rayv1alpha1.JobStatusStopped, rayv1alpha1.JobDeploymentStatusSuspended, nil)
+			if err != nil {
+				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+			}
+			r.Log.Info("rayJob suspended", "RayJob", rayJobInstance.Name)
+			r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Suspended", "Suspended RayJob %s", rayJobInstance.Name)
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+		}
+		// The job may take a long time to start and finish; just periodically requeue and check its status.
+		if isJobPendingOrRunning(jobInfo.JobStatus) {
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+		}
 	}
 
 	// Let's use rayJobInstance.Status.JobStatus to make sure we only delete cluster after the CR is updated.
@@ -231,34 +285,38 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
 					return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil
 				}
 			}
-
 			r.Log.Info("shutdownAfterJobFinishes set to true, we will delete cluster",
 				"RayJob", rayJobInstance.Name, "clusterName", fmt.Sprintf("%s/%s", rayJobInstance.Namespace, rayJobInstance.Status.RayClusterName))
-			clusterIdentifier := types.NamespacedName{
-				Name:      rayJobInstance.Status.RayClusterName,
-				Namespace: rayJobInstance.Namespace,
-			}
-			cluster := rayv1alpha1.RayCluster{}
-			if err := r.Get(ctx, clusterIdentifier, &cluster); err != nil {
-				if !errors.IsNotFound(err) {
-					return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
-				}
-				r.Log.Info("The associated cluster has been already deleted and it can not be found", "RayCluster", clusterIdentifier)
-			} else {
-				if cluster.DeletionTimestamp != nil {
-					r.Log.Info("The cluster deletion is ongoing.", "rayjob", rayJobInstance.Name, "raycluster", cluster.Name)
-				} else {
-					if err := r.Delete(ctx, &cluster); err != nil {
-						return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
-					}
-					r.Log.Info("The associated cluster is deleted", "RayCluster", clusterIdentifier)
-					r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Deleted", "Deleted cluster %s", rayJobInstance.Status.RayClusterName)
-					return ctrl.Result{Requeue: true}, nil
-				}
-			}
+			return r.deleteCluster(ctx, rayJobInstance)
 		}
 	}
+	return ctrl.Result{}, nil
+}
 
+func (r *RayJobReconciler) deleteCluster(ctx context.Context, rayJobInstance *rayv1alpha1.RayJob) (reconcile.Result, error) {
+	clusterIdentifier := types.NamespacedName{
+		Name:      rayJobInstance.Status.RayClusterName,
+		Namespace: rayJobInstance.Namespace,
+	}
+	cluster := rayv1alpha1.RayCluster{}
+	if err := r.Get(ctx, clusterIdentifier, &cluster); err != nil {
+		if !errors.IsNotFound(err) {
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+		}
+		r.Log.Info("The associated cluster has been already deleted and it can not be found", "RayCluster", clusterIdentifier)
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	} else {
+		if cluster.DeletionTimestamp != nil {
+			r.Log.Info("The cluster deletion is ongoing.", "rayjob", rayJobInstance.Name, "raycluster", cluster.Name)
+		} else {
+			if err := r.Delete(ctx, &cluster); err != nil {
+				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+			}
+			r.Log.Info("The associated cluster is deleted", "RayCluster", clusterIdentifier)
+			r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Deleted", "Deleted cluster %s", rayJobInstance.Status.RayClusterName)
+			return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+		}
+	}
 	return ctrl.Result{}, nil
 }
 
@@ -343,7 +401,11 @@ func (r *RayJobReconciler) updateState(ctx context.Context, rayJob *rayv1alpha1.
 	if jobInfo != nil {
 		rayJob.Status.Message = jobInfo.Message
 		rayJob.Status.StartTime = utils.ConvertUnixTimeToMetav1Time(jobInfo.StartTime)
-		rayJob.Status.EndTime = utils.ConvertUnixTimeToMetav1Time(jobInfo.EndTime)
+		if jobInfo.StartTime >= jobInfo.EndTime {
+			rayJob.Status.EndTime = nil
+		} else {
+			rayJob.Status.EndTime = utils.ConvertUnixTimeToMetav1Time(jobInfo.EndTime)
+		}
 	}
 
 	// TODO (kevin85421): ObservedGeneration should be used to determine whether update this CR or not.
@@ -391,11 +453,15 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
 		return nil, err
 	}
 
-	// one special case is the job is complete status and cluster has been recycled.
+	// special case: the job is in complete status and the cluster has been recycled.
 	if isJobSucceedOrFailed(rayJobInstance.Status.JobStatus) && rayJobInstance.Status.JobDeploymentStatus == rayv1alpha1.JobDeploymentStatusComplete {
 		r.Log.Info("The cluster has been recycled for the job, skip duplicate creation", "rayjob", rayJobInstance.Name)
 		return nil, err
 	}
+	// special case: don't create a cluster instance and don't return an error if the suspend flag of the job is true
+	if rayJobInstance.Spec.Suspend {
+		return nil, nil
+	}
 
 	r.Log.Info("RayCluster not found, creating rayCluster!", "raycluster", rayClusterNamespacedName)
 	rayClusterInstance, err = r.constructRayClusterForRayJob(rayJobInstance, rayClusterInstanceName)