Commit 4e96e9a

Implement support for suspend semantics for MPIJob

1 parent c5a0c32

File tree

12 files changed: 193 additions, 65 deletions

crd/kubeflow.org_mpijobs.yaml

Lines changed: 12 additions & 0 deletions

@@ -7866,6 +7866,18 @@ spec:
                     format: int32
                   type: integer
                 type: object
+              suspend:
+                default: false
+                description: "suspend specifies whether the MPIJob controller
+                  should create Pods or not. If a MPIJob is created with suspend
+                  set to true, no Pods are created by the MPIJob controller. If
+                  a MPIJob is suspended after creation (i.e. the flag goes from
+                  false to true), the MPIJob controller will delete all active
+                  Pods associated with this MPIJob. Also, it will suspend the
+                  Launcher Job. Users must design their workload to gracefully
+                  handle this. Suspending a Job will reset the StartTime field
+                  of the MPIJob. \n Defaults to false."
+                type: boolean
               ttlSecondsAfterFinished:
                 description: TTLSecondsAfterFinished is the TTL to clean up jobs.
                   It may take extra ReconcilePeriod seconds for the cleanup, since
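
For illustration only: the new field is consumed through spec.runPolicy.suspend. The minimal Go sketch below creates an MPIJob object that starts out suspended, assuming the v2beta1 types added later in this commit; the import path, object name, and the omitted replica specs are illustrative, not part of this commit.

    package main

    import (
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/utils/pointer"

        kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" // illustrative import path
    )

    func main() {
        // Construct an MPIJob that starts out suspended: the controller creates no
        // worker Pods and creates the launcher Job with spec.suspend=true.
        mpiJob := &kubeflow.MPIJob{
            ObjectMeta: metav1.ObjectMeta{Name: "pi", Namespace: "default"},
            Spec: kubeflow.MPIJobSpec{
                RunPolicy: kubeflow.RunPolicy{
                    Suspend: pointer.Bool(true),
                },
                // MPIReplicaSpecs for the Launcher and Worker are omitted for brevity.
            },
        }
        fmt.Println("suspended:", pointer.BoolDeref(mpiJob.Spec.RunPolicy.Suspend, false))
    }

Flipping suspend from false to true on an existing MPIJob triggers the behavior described above: active worker Pods are deleted, the launcher Job is suspended, and StartTime is reset.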

go.mod

Lines changed: 1 addition & 1 deletion

@@ -17,6 +17,7 @@ require (
     k8s.io/klog v1.0.0
     k8s.io/kube-openapi v0.0.0-20230109183929-3758b55a6596
     k8s.io/sample-controller v0.25.6
+    k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed
     sigs.k8s.io/controller-runtime v0.13.1
     volcano.sh/apis v1.7.0
 )
@@ -73,7 +74,6 @@ require (
     k8s.io/component-base v0.25.6 // indirect
     k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 // indirect
     k8s.io/klog/v2 v2.70.1 // indirect
-    k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
     sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
     sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
     sigs.k8s.io/yaml v1.3.0 // indirect

manifests/base/crd.yaml

Lines changed: 3 additions & 1 deletion

@@ -49,6 +49,8 @@ spec:
                 type: integer
                 minimum: 0
                 description: "Specifies the number of retries before marking the launcher Job as failed. Defaults to 6."
+              suspend:
+                type: boolean
               sshAuthMountPath:
                 type: string
               mpiImplementation:
@@ -94,7 +96,7 @@ spec:
               properties:
                 type:
                   type: string
-                  enum: ["Created", "Running", "Restarting", "Succeeded", "Failed"]
+                  enum: ["Created", "Running", "Restarting", "Succeeded", "Suspended", "Failed"]
                 status:
                   type: string
                   enum: ["True", "False", "Unknown"]

pkg/apis/kubeflow/v2beta1/openapi_generated.go

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default.

pkg/apis/kubeflow/v2beta1/swagger.json

Lines changed: 4 additions & 0 deletions

@@ -295,6 +295,10 @@
           "description": "SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling",
           "$ref": "#/definitions/v2beta1.SchedulingPolicy"
         },
+        "suspend": {
+          "description": "suspend specifies whether the MPIJob controller should create Pods or not. If a MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If a MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the MPIJob.\n\nDefaults to false.",
+          "type": "boolean"
+        },
         "ttlSecondsAfterFinished": {
           "description": "TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite.",
           "type": "integer",

pkg/apis/kubeflow/v2beta1/types.go

Lines changed: 17 additions & 0 deletions

@@ -84,6 +84,18 @@ type RunPolicy struct {
     // SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling
     // +optional
     SchedulingPolicy *SchedulingPolicy `json:"schedulingPolicy,omitempty"`
+
+    // suspend specifies whether the MPIJob controller should create Pods or not.
+    // If a MPIJob is created with suspend set to true, no Pods are created by
+    // the MPIJob controller. If a MPIJob is suspended after creation (i.e. the
+    // flag goes from false to true), the MPIJob controller will delete all
+    // active Pods associated with this MPIJob. Also, it will suspend the
+    // Launcher Job. Users must design their workload to gracefully handle this.
+    // Suspending a Job will reset the StartTime field of the MPIJob.
+    //
+    // Defaults to false.
+    // +kubebuilder:default:=false
+    Suspend *bool `json:"suspend,omitempty"`
 }

 type MPIJobSpec struct {
@@ -130,3 +142,8 @@ const (
     MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
     MPIImplementationIntel   MPIImplementation = "Intel"
 )
+
+const (
+    // JobSuspended means the job has been suspended.
+    JobSuspended common.JobConditionType = "Suspended"
+)
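
Because JobSuspended is an ordinary common.JobConditionType, clients can detect suspension from status.conditions. A minimal sketch, assuming the kubeflow common condition types that back the MPIJob status (import path and helper name are illustrative):

    package main

    import (
        "fmt"

        corev1 "k8s.io/api/core/v1"

        common "github.com/kubeflow/common/pkg/apis/common/v1" // illustrative import path
    )

    // isSuspended reports whether the status carries a "Suspended" condition with
    // status True, i.e. the value written by the controller for JobSuspended.
    func isSuspended(status common.JobStatus) bool {
        for _, cond := range status.Conditions {
            if cond.Type == "Suspended" && cond.Status == corev1.ConditionTrue {
                return true
            }
        }
        return false
    }

    func main() {
        status := common.JobStatus{
            Conditions: []common.JobCondition{
                {Type: "Suspended", Status: corev1.ConditionTrue, Reason: "MPIJobSuspended"},
            },
        }
        fmt.Println(isSuspended(status)) // true
    }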

pkg/apis/kubeflow/v2beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default.

pkg/controller/mpi_job_controller.go

Lines changed: 94 additions & 45 deletions

@@ -33,6 +33,7 @@ import (
     "golang.org/x/crypto/ssh"
     batchv1 "k8s.io/api/batch/v1"
     corev1 "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/equality"
     "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -50,6 +51,7 @@ import (
     "k8s.io/client-go/tools/record"
     "k8s.io/client-go/util/workqueue"
     "k8s.io/klog"
+    "k8s.io/utils/pointer"
     podgroupv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
     volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
     podgroupsinformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
@@ -493,7 +495,7 @@ func (c *MPIJobController) syncHandler(key string) error {

     if len(mpiJob.Status.Conditions) == 0 {
         msg := fmt.Sprintf("MPIJob %s/%s is created.", mpiJob.Namespace, mpiJob.Name)
-        updateMPIJobConditions(mpiJob, common.JobCreated, mpiJobCreatedReason, msg)
+        updateMPIJobConditions(mpiJob, common.JobCreated, v1.ConditionTrue, mpiJobCreatedReason, msg)
         c.recorder.Event(mpiJob, corev1.EventTypeNormal, "MPIJobCreated", msg)
         mpiJobsCreatedCount.Inc()
     }
@@ -503,24 +505,13 @@ func (c *MPIJobController) syncHandler(key string) error {
     // cleanup and stop retrying the MPIJob.
     if isFinished(mpiJob.Status) && mpiJob.Status.CompletionTime != nil {
         if isCleanUpPods(mpiJob.Spec.RunPolicy.CleanPodPolicy) {
-            // set worker StatefulSet Replicas to 0.
-            if err := c.deleteWorkerPods(mpiJob); err != nil {
-                return err
-            }
-            initializeMPIJobStatuses(mpiJob, kubeflow.MPIReplicaTypeWorker)
-            if c.gangSchedulerName != "" {
-                if err := c.deletePodGroups(mpiJob); err != nil {
-                    return err
-                }
-            }
-            mpiJob.Status.ReplicaStatuses[common.ReplicaType(kubeflow.MPIReplicaTypeWorker)].Active = 0
-            return c.updateStatusHandler(mpiJob)
+            return cleanUpWorkerPods(mpiJob, c)
         }
         return nil
     }

     // first set StartTime.
-    if mpiJob.Status.StartTime == nil {
+    if mpiJob.Status.StartTime == nil && !isMPIJobSuspended(mpiJob) {
         now := metav1.Now()
         mpiJob.Status.StartTime = &now
     }
@@ -535,38 +526,40 @@ func (c *MPIJobController) syncHandler(key string) error {
     // We're done if the launcher either succeeded or failed.
     done := launcher != nil && isJobFinished(launcher)
     if !done {
-        _, err := c.getOrCreateService(mpiJob, newWorkersService(mpiJob))
-        if err != nil {
-            return fmt.Errorf("getting or creating Service to front workers: %w", err)
-        }
+        if !isMPIJobSuspended(mpiJob) {
+            _, err := c.getOrCreateService(mpiJob, newWorkersService(mpiJob))
+            if err != nil {
+                return fmt.Errorf("getting or creating Service to front workers: %w", err)
+            }

-        if config, err := c.getOrCreateConfigMap(mpiJob); config == nil || err != nil {
-            return fmt.Errorf("getting or creating ConfigMap: %w", err)
-        }
+            if config, err := c.getOrCreateConfigMap(mpiJob); config == nil || err != nil {
+                return fmt.Errorf("getting or creating ConfigMap: %w", err)
+            }

-        _, err = c.getOrCreateSSHAuthSecret(mpiJob)
-        if err != nil {
-            return fmt.Errorf("creating SSH auth secret: %w", err)
-        }
+            _, err = c.getOrCreateSSHAuthSecret(mpiJob)
+            if err != nil {
+                return fmt.Errorf("creating SSH auth secret: %w", err)
+            }

-        // Get the PodGroup for this MPIJob
-        if c.gangSchedulerName != "" {
-            if podgroup, err := c.getOrCreatePodGroups(mpiJob, workerReplicas(mpiJob)+1); podgroup == nil || err != nil {
-                return err
+            // Get the PodGroup for this MPIJob
+            if c.gangSchedulerName != "" {
+                if podgroup, err := c.getOrCreatePodGroups(mpiJob, workerReplicas(mpiJob)+1); podgroup == nil || err != nil {
+                    return err
+                }
             }
-        }

-        worker, err = c.getOrCreateWorker(mpiJob)
-        if err != nil {
-            return err
-        }
-        if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel {
-            // The Intel implementation requires workers to communicate with the
-            // launcher through its hostname. For that, we create a Service which
-            // has the same name as the launcher's hostname.
-            _, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
+            worker, err = c.getOrCreateWorker(mpiJob)
             if err != nil {
-                return fmt.Errorf("getting or creating Service to front launcher: %w", err)
+                return err
+            }
+            if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel {
+                // The Intel implementation requires workers to communicate with the
+                // launcher through its hostname. For that, we create a Service which
+                // has the same name as the launcher's hostname.
+                _, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
+                if err != nil {
+                    return fmt.Errorf("getting or creating Service to front launcher: %w", err)
+                }
             }
         }
     if launcher == nil {
@@ -585,9 +578,40 @@ func (c *MPIJobController) syncHandler(key string) error {
             return err
         }

+    if launcher != nil {
+        if isMPIJobSuspended(mpiJob) != isJobSuspended(launcher) {
+            // align the suspension state of launcher with the MPIJob
+            launcher.Spec.Suspend = pointer.Bool(isMPIJobSuspended(mpiJob))
+            if _, err := c.kubeClient.BatchV1().Jobs(namespace).Update(context.TODO(), launcher, metav1.UpdateOptions{}); err != nil {
+                return err
+            }
+        }
+    }
+
+    // cleanup the running worker pods if the MPI job is suspended
+    if isMPIJobSuspended(mpiJob) {
+        if err := cleanUpWorkerPods(mpiJob, c); err != nil {
+            return err
+        }
+    }
     return nil
 }

+func cleanUpWorkerPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
+    // set worker StatefulSet Replicas to 0.
+    if err := c.deleteWorkerPods(mpiJob); err != nil {
+        return err
+    }
+    initializeMPIJobStatuses(mpiJob, kubeflow.MPIReplicaTypeWorker)
+    if c.gangSchedulerName != "" {
+        if err := c.deletePodGroups(mpiJob); err != nil {
+            return err
+        }
+    }
+    mpiJob.Status.ReplicaStatuses[common.ReplicaType(kubeflow.MPIReplicaTypeWorker)].Active = 0
+    return c.updateStatusHandler(mpiJob)
+}
+
 // getLauncherJob gets the launcher Job controlled by this MPIJob.
 func (c *MPIJobController) getLauncherJob(mpiJob *kubeflow.MPIJob) (*batchv1.Job, error) {
     launcher, err := c.jobLister.Jobs(mpiJob.Namespace).Get(mpiJob.Name + launcherSuffix)
@@ -857,6 +881,14 @@ func (c *MPIJobController) getOrCreateWorker(mpiJob *kubeflow.MPIJob) ([]*corev1
     return workerPods, nil
 }

+func isMPIJobSuspended(mpiJob *kubeflow.MPIJob) bool {
+    return pointer.BoolDeref(mpiJob.Spec.RunPolicy.Suspend, false)
+}
+
+func isJobSuspended(job *batchv1.Job) bool {
+    return pointer.BoolDeref(job.Spec.Suspend, false)
+}
+
 func (c *MPIJobController) deleteWorkerPods(mpiJob *kubeflow.MPIJob) error {
     var (
         workerPrefix = mpiJob.Name + workerSuffix
@@ -905,6 +937,19 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
     if err != nil {
         return fmt.Errorf("checking launcher pods running: %w", err)
     }
+    if isMPIJobSuspended(mpiJob) {
+        // it is suspended now
+        if updateMPIJobConditions(mpiJob, kubeflow.JobSuspended, v1.ConditionTrue, "MPIJobSuspended", "MPIJob suspended") {
+            c.recorder.Event(mpiJob, corev1.EventTypeNormal, "Suspended", "MPIJob suspended")
+        }
+    } else if getCondition(mpiJob.Status, kubeflow.JobSuspended) != nil {
+        // it is not suspended now, consider resumed if the condition was set before
+        if updateMPIJobConditions(mpiJob, kubeflow.JobSuspended, v1.ConditionTrue, "MPIJobResumed", "MPIJob resumed") {
+            c.recorder.Event(mpiJob, corev1.EventTypeNormal, "Resumed", "MPIJob resumed")
+            now := metav1.NewTime(time.Now())
+            mpiJob.Status.StartTime = &now
+        }
+    }
     // Job.status.Active accounts for Pending and Running pods. Count running pods
     // from the lister instead.
     launcherPodsCnt := countRunningPods(launcherPods)
@@ -919,7 +964,7 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
         if mpiJob.Status.CompletionTime == nil {
             mpiJob.Status.CompletionTime = launcher.Status.CompletionTime
         }
-        updateMPIJobConditions(mpiJob, common.JobSucceeded, mpiJobSucceededReason, msg)
+        updateMPIJobConditions(mpiJob, common.JobSucceeded, v1.ConditionTrue, mpiJobSucceededReason, msg)
         mpiJobsSuccessCount.Inc()
     } else if isJobFailed(launcher) {
         c.updateMPIJobFailedStatus(mpiJob, launcher, launcherPods)
@@ -953,13 +998,13 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
     if evict > 0 {
         msg := fmt.Sprintf("%d/%d workers are evicted", evict, len(worker))
         klog.Infof("MPIJob <%s/%s>: %v", mpiJob.Namespace, mpiJob.Name, msg)
-        updateMPIJobConditions(mpiJob, common.JobFailed, mpiJobEvict, msg)
+        updateMPIJobConditions(mpiJob, common.JobFailed, v1.ConditionTrue, mpiJobEvict, msg)
         c.recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobEvict, msg)
     }

     if launcher != nil && launcherPodsCnt >= 1 && running == len(worker) {
         msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name)
-        updateMPIJobConditions(mpiJob, common.JobRunning, mpiJobRunningReason, msg)
+        updateMPIJobConditions(mpiJob, common.JobRunning, v1.ConditionTrue, mpiJobRunningReason, msg)
         c.recorder.Eventf(mpiJob, corev1.EventTypeNormal, "MPIJobRunning", "MPIJob %s/%s is running", mpiJob.Namespace, mpiJob.Name)
     }

@@ -999,7 +1044,7 @@ func (c *MPIJobController) updateMPIJobFailedStatus(mpiJob *kubeflow.MPIJob, lau
         now := metav1.Now()
         mpiJob.Status.CompletionTime = &now
     }
-    updateMPIJobConditions(mpiJob, common.JobFailed, reason, msg)
+    updateMPIJobConditions(mpiJob, common.JobFailed, v1.ConditionTrue, reason, msg)
     mpiJobsFailureCount.Inc()
 }

@@ -1304,7 +1349,7 @@ func (c *MPIJobController) newWorker(mpiJob *kubeflow.MPIJob, index int) *corev1
 }

 func (c *MPIJobController) newLauncherJob(mpiJob *kubeflow.MPIJob) *batchv1.Job {
-    return &batchv1.Job{
+    job := &batchv1.Job{
         ObjectMeta: metav1.ObjectMeta{
             Name:      mpiJob.Name + launcherSuffix,
             Namespace: mpiJob.Namespace,
@@ -1322,6 +1367,10 @@ func (c *MPIJobController) newLauncherJob(mpiJob *kubeflow.MPIJob) *batchv1.Job
             Template: c.newLauncherPodTemplate(mpiJob),
         },
     }
+    if isMPIJobSuspended(mpiJob) {
+        job.Spec.Suspend = pointer.Bool(true)
+    }
+    return job
 }

 // newLauncherPodTemplate creates a new launcher Job for an MPIJob resource. It also sets
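
The isMPIJobSuspended and isJobSuspended helpers above rely on pointer.BoolDeref, so an unset suspend field behaves exactly like an explicit false and pre-existing MPIJobs are unaffected. A small standalone sketch of that defaulting (the suspended helper name is hypothetical, not part of this commit):

    package main

    import (
        "fmt"

        "k8s.io/utils/pointer"
    )

    // suspended mirrors the nil-handling used by isMPIJobSuspended/isJobSuspended:
    // a nil pointer is treated as "not suspended".
    func suspended(s *bool) bool {
        return pointer.BoolDeref(s, false)
    }

    func main() {
        fmt.Println(suspended(nil))                 // false: field not set
        fmt.Println(suspended(pointer.Bool(false))) // false: explicitly resumed
        fmt.Println(suspended(pointer.Bool(true)))  // true: workers deleted, launcher Job suspended
    }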
