Skip to content

Commit da1e019

Browse files
committed
Remarks
1 parent 3cdd581 commit da1e019

File tree

6 files changed

+27
-34
lines changed

6 files changed

+27
-34
lines changed

crd/kubeflow.org_mpijobs.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7886,15 +7886,15 @@ spec:
78867886
type: string
78877887
suspend:
78887888
default: false
7889-
description: "suspend specifies whether the Job controller should
7890-
create Pods or not. If a Job is created with suspend set to true,
7891-
no Pods are created by the Job controller. If a Job is suspended
7892-
after creation (i.e. the flag goes from false to true), the Job
7893-
controller will delete all active Pods associated with this Job.
7894-
Users must design their workload to gracefully handle this. Suspending
7895-
a Job will reset the StartTime field of the Job, effectively resetting
7896-
the ActiveDeadlineSeconds timer too. Defaults to false. \n Defaults
7897-
to false."
7889+
description: "suspend specifies whether the MPIJob controller should
7890+
create Pods or not. If an MPIJob is created with suspend set to true,
7891+
no Pods are created by the MPIJob controller. If an MPIJob is suspended
7892+
after creation (i.e. the flag goes from false to true), the MPIJob
7893+
controller will delete all active Pods associated with this MPIJob.
7894+
Also, it will suspend the Launcher Job. Users must design their
7895+
workload to gracefully handle this. Suspending an MPIJob will reset
7896+
the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds
7897+
timer too. \n Defaults to false."
78987898
type: boolean
78997899
required:
79007900
- mpiReplicaSpecs

pkg/apis/kubeflow/v2beta1/swagger.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@
272272
"type": "string"
273273
},
274274
"suspend": {
275-
"description": "suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.",
275+
"description": "suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset its StartTime field, effectively resetting the activeDeadlineSeconds timer too.\n\nDefaults to false.",
276276
"type": "boolean"
277277
}
278278
}

pkg/apis/kubeflow/v2beta1/types.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,14 @@ type MPIJobSpec struct {
6363
// +kubebuilder:default:=OpenMPI
6464
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
6565

66-
// suspend specifies whether the Job controller should create Pods or not. If
67-
// a Job is created with suspend set to true, no Pods are created by the Job
68-
// controller. If a Job is suspended after creation (i.e. the flag goes from
69-
// false to true), the Job controller will delete all active Pods associated
70-
// with this Job. Users must design their workload to gracefully handle this.
71-
// Suspending a Job will reset the StartTime field of the Job, effectively
72-
// resetting the ActiveDeadlineSeconds timer too. Defaults to false.
66+
// suspend specifies whether the MPIJob controller should create Pods or not.
67+
// If an MPIJob is created with suspend set to true, no Pods are created by
68+
// the MPIJob controller. If an MPIJob is suspended after creation (i.e. the
69+
// flag goes from false to true), the MPIJob controller will delete all
70+
// active Pods associated with this MPIJob. Also, it will suspend the
71+
// Launcher Job. Users must design their workload to gracefully handle this.
72+
// Suspending an MPIJob will reset its StartTime field, effectively
73+
// resetting the activeDeadlineSeconds timer too.
7374
//
7475
// Defaults to false.
7576
// +kubebuilder:default:=false

pkg/controller/mpi_job_controller.go

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,7 @@ func (c *MPIJobController) syncHandler(key string) error {
505505
// cleanup and stop retrying the MPIJob.
506506
if isFinished(mpiJob.Status) && mpiJob.Status.CompletionTime != nil {
507507
if isCleanUpPods(mpiJob.Spec.RunPolicy.CleanPodPolicy) {
508-
return cleanUpPods(mpiJob, c)
508+
return cleanUpWorkerPods(mpiJob, c)
509509
}
510510
return nil
511511
}
@@ -578,17 +578,9 @@ func (c *MPIJobController) syncHandler(key string) error {
578578
}
579579

580580
if launcher != nil {
581-
launcherSuspendUpdate := false
582-
if isMPIJobSuspended(mpiJob) && !isJobSuspended(launcher) {
583-
// suspend the launcher first if the MPI job is suspended
584-
launcherSuspendUpdate = true
585-
launcher.Spec.Suspend = pointer.Bool(true)
586-
} else if !isMPIJobSuspended(mpiJob) && isJobSuspended(launcher) {
587-
launcherSuspendUpdate = true
588-
// unsuspend the launcher first if the MPI job is unsuspended
589-
launcher.Spec.Suspend = pointer.Bool(false)
590-
}
591-
if launcherSuspendUpdate {
581+
if isMPIJobSuspended(mpiJob) != isJobSuspended(launcher) {
582+
// align the suspension state of launcher with the MPIJob
583+
launcher.Spec.Suspend = pointer.Bool(isMPIJobSuspended(mpiJob))
592584
if _, err := c.kubeClient.BatchV1().Jobs(namespace).Update(context.TODO(), launcher, metav1.UpdateOptions{}); err != nil {
593585
return err
594586
}
@@ -597,14 +589,14 @@ func (c *MPIJobController) syncHandler(key string) error {
597589

598590
// cleanup the running worker pods if the MPI job is suspended
599591
if isMPIJobSuspended(mpiJob) {
600-
if err := cleanUpPods(mpiJob, c); err != nil {
592+
if err := cleanUpWorkerPods(mpiJob, c); err != nil {
601593
return err
602594
}
603595
}
604596
return nil
605597
}
606598

607-
func cleanUpPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
599+
func cleanUpWorkerPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
608600
// delete the worker pods of this MPIJob.
609601
if err := c.deleteWorkerPods(mpiJob); err != nil {
610602
return err

sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)