Skip to content

Commit aaf12e6

Browse files
committed
Remarks
1 parent 3cdd581 commit aaf12e6

File tree

6 files changed

+27
-35
lines changed

6 files changed

+27
-35
lines changed

crd/kubeflow.org_mpijobs.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7886,15 +7886,15 @@ spec:
78867886
type: string
78877887
suspend:
78887888
default: false
7889-
description: "suspend specifies whether the Job controller should
7890-
create Pods or not. If a Job is created with suspend set to true,
7891-
no Pods are created by the Job controller. If a Job is suspended
7892-
after creation (i.e. the flag goes from false to true), the Job
7893-
controller will delete all active Pods associated with this Job.
7894-
Users must design their workload to gracefully handle this. Suspending
7895-
a Job will reset the StartTime field of the Job, effectively resetting
7896-
the ActiveDeadlineSeconds timer too. Defaults to false. \n Defaults
7897-
to false."
7889+
description: "suspend specifies whether the MPIJob controller should
7890+
create Pods or not. If an MPIJob is created with suspend set to true,
7891+
no Pods are created by the MPIJob controller. If an MPIJob is suspended
7892+
after creation (i.e. the flag goes from false to true), the MPIJob
7893+
controller will delete all active Pods associated with this MPIJob.
7894+
Also, it will suspend the Launcher Job. Users must design their
7895+
workload to gracefully handle this. Suspending an MPIJob will reset
7896+
the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds
7897+
timer too. Defaults to false. \n Defaults to false."
78987898
type: boolean
78997899
required:
79007900
- mpiReplicaSpecs

pkg/apis/kubeflow/v2beta1/swagger.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@
272272
"type": "string"
273273
},
274274
"suspend": {
275-
"description": "suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.",
275+
"description": "suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.",
276276
"type": "boolean"
277277
}
278278
}

pkg/apis/kubeflow/v2beta1/types.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,14 @@ type MPIJobSpec struct {
6363
// +kubebuilder:default:=OpenMPI
6464
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
6565

66-
// suspend specifies whether the Job controller should create Pods or not. If
67-
// a Job is created with suspend set to true, no Pods are created by the Job
68-
// controller. If a Job is suspended after creation (i.e. the flag goes from
69-
// false to true), the Job controller will delete all active Pods associated
70-
// with this Job. Users must design their workload to gracefully handle this.
71-
// Suspending a Job will reset the StartTime field of the Job, effectively
72-
// resetting the ActiveDeadlineSeconds timer too. Defaults to false.
66+
// suspend specifies whether the MPIJob controller should create Pods or not.
67+
// If an MPIJob is created with suspend set to true, no Pods are created by
68+
// the MPIJob controller. If an MPIJob is suspended after creation (i.e. the
69+
// flag goes from false to true), the MPIJob controller will delete all
70+
// active Pods associated with this MPIJob. Also, it will suspend the
71+
// Launcher Job. Users must design their workload to gracefully handle this.
72+
// Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively
73+
// resetting the activeDeadlineSeconds timer too. Defaults to false.
7374
//
7475
// Defaults to false.
7576
// +kubebuilder:default:=false

pkg/controller/mpi_job_controller.go

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,6 @@ func (c *MPIJobController) processNextWorkItem() bool {
450450
// converge the two. It then updates the Status block of the MPIJob resource
451451
// with the current status of the resource.
452452
func (c *MPIJobController) syncHandler(key string) error {
453-
klog.Infof("___ MYDEBUG starting for %s", key)
454453
startTime := time.Now()
455454
defer func() {
456455
klog.Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -505,7 +504,7 @@ func (c *MPIJobController) syncHandler(key string) error {
505504
// cleanup and stop retrying the MPIJob.
506505
if isFinished(mpiJob.Status) && mpiJob.Status.CompletionTime != nil {
507506
if isCleanUpPods(mpiJob.Spec.RunPolicy.CleanPodPolicy) {
508-
return cleanUpPods(mpiJob, c)
507+
return cleanUpWorkerPods(mpiJob, c)
509508
}
510509
return nil
511510
}
@@ -578,17 +577,9 @@ func (c *MPIJobController) syncHandler(key string) error {
578577
}
579578

580579
if launcher != nil {
581-
launcherSuspendUpdate := false
582-
if isMPIJobSuspended(mpiJob) && !isJobSuspended(launcher) {
583-
// suspend the launcher first if the MPI job is suspended
584-
launcherSuspendUpdate = true
585-
launcher.Spec.Suspend = pointer.Bool(true)
586-
} else if !isMPIJobSuspended(mpiJob) && isJobSuspended(launcher) {
587-
launcherSuspendUpdate = true
588-
// unsuspend the launcher first if the MPI job is unsuspended
589-
launcher.Spec.Suspend = pointer.Bool(false)
590-
}
591-
if launcherSuspendUpdate {
580+
if isMPIJobSuspended(mpiJob) != isJobSuspended(launcher) {
581+
// align the suspension state of launcher with the MPIJob
582+
launcher.Spec.Suspend = pointer.Bool(isMPIJobSuspended(mpiJob))
592583
if _, err := c.kubeClient.BatchV1().Jobs(namespace).Update(context.TODO(), launcher, metav1.UpdateOptions{}); err != nil {
593584
return err
594585
}
@@ -597,14 +588,14 @@ func (c *MPIJobController) syncHandler(key string) error {
597588

598589
// cleanup the running worker pods if the MPI job is suspended
599590
if isMPIJobSuspended(mpiJob) {
600-
if err := cleanUpPods(mpiJob, c); err != nil {
591+
if err := cleanUpWorkerPods(mpiJob, c); err != nil {
601592
return err
602593
}
603594
}
604595
return nil
605596
}
606597

607-
func cleanUpPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
598+
func cleanUpWorkerPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
608599
// set worker StatefulSet Replicas to 0.
609600
if err := c.deleteWorkerPods(mpiJob); err != nil {
610601
return err

sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)