Skip to content

Commit 0b32af3

Browse files
authored
Use local copy of RunPolicy by MPI-operator (#513)
* Use local copy of RunPolicy by MPI-operator

  Steps performed:
  - copy the `RunPolicy` from common to `types.go`
  - fix compilation errors by using the local RunPolicy definition
  - run `make generate`
  - run `make all`
  - regenerate openapi_generated.go by running `./hack/python-sdk/gen-sdk.sh` (with the rollback commented out)

* Copy SchedulingPolicy and CleanPodPolicy for RunPolicy
1 parent 382da78 commit 0b32af3

22 files changed

+1010
-92
lines changed

pkg/apis/kubeflow/v2beta1/default.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ func setDefaultsTypeWorker(spec *common.ReplicaSpec) {
4949
}
5050
}
5151

52-
func setDefaultsRunPolicy(policy *common.RunPolicy) {
52+
func setDefaultsRunPolicy(policy *RunPolicy) {
5353
if policy.CleanPodPolicy == nil {
54-
policy.CleanPodPolicy = newCleanPodPolicy(common.CleanPodPolicyNone)
54+
policy.CleanPodPolicy = newCleanPodPolicy(CleanPodPolicyNone)
5555
}
5656
// The remaining fields are passed as-is to the k8s Job API, which does its
5757
// own defaulting.
@@ -80,6 +80,6 @@ func newInt32(v int32) *int32 {
8080
return &v
8181
}
8282

83-
func newCleanPodPolicy(policy common.CleanPodPolicy) *common.CleanPodPolicy {
83+
func newCleanPodPolicy(policy CleanPodPolicy) *CleanPodPolicy {
8484
return &policy
8585
}

pkg/apis/kubeflow/v2beta1/default_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ func TestSetDefaults_MPIJob(t *testing.T) {
3030
want: MPIJob{
3131
Spec: MPIJobSpec{
3232
SlotsPerWorker: newInt32(1),
33-
RunPolicy: common.RunPolicy{
34-
CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyNone),
33+
RunPolicy: RunPolicy{
34+
CleanPodPolicy: newCleanPodPolicy(CleanPodPolicyNone),
3535
},
3636
SSHAuthMountPath: "/root/.ssh",
3737
MPIImplementation: MPIImplementationOpenMPI,
@@ -42,8 +42,8 @@ func TestSetDefaults_MPIJob(t *testing.T) {
4242
job: MPIJob{
4343
Spec: MPIJobSpec{
4444
SlotsPerWorker: newInt32(10),
45-
RunPolicy: common.RunPolicy{
46-
CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyRunning),
45+
RunPolicy: RunPolicy{
46+
CleanPodPolicy: newCleanPodPolicy(CleanPodPolicyRunning),
4747
TTLSecondsAfterFinished: newInt32(2),
4848
ActiveDeadlineSeconds: newInt64(3),
4949
BackoffLimit: newInt32(4),
@@ -55,8 +55,8 @@ func TestSetDefaults_MPIJob(t *testing.T) {
5555
want: MPIJob{
5656
Spec: MPIJobSpec{
5757
SlotsPerWorker: newInt32(10),
58-
RunPolicy: common.RunPolicy{
59-
CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyRunning),
58+
RunPolicy: RunPolicy{
59+
CleanPodPolicy: newCleanPodPolicy(CleanPodPolicyRunning),
6060
TTLSecondsAfterFinished: newInt32(2),
6161
ActiveDeadlineSeconds: newInt64(3),
6262
BackoffLimit: newInt32(4),
@@ -77,8 +77,8 @@ func TestSetDefaults_MPIJob(t *testing.T) {
7777
want: MPIJob{
7878
Spec: MPIJobSpec{
7979
SlotsPerWorker: newInt32(1),
80-
RunPolicy: common.RunPolicy{
81-
CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyNone),
80+
RunPolicy: RunPolicy{
81+
CleanPodPolicy: newCleanPodPolicy(CleanPodPolicyNone),
8282
},
8383
SSHAuthMountPath: "/root/.ssh",
8484
MPIImplementation: MPIImplementationOpenMPI,
@@ -102,8 +102,8 @@ func TestSetDefaults_MPIJob(t *testing.T) {
102102
want: MPIJob{
103103
Spec: MPIJobSpec{
104104
SlotsPerWorker: newInt32(1),
105-
RunPolicy: common.RunPolicy{
106-
CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyNone),
105+
RunPolicy: RunPolicy{
106+
CleanPodPolicy: newCleanPodPolicy(CleanPodPolicyNone),
107107
},
108108
SSHAuthMountPath: "/root/.ssh",
109109
MPIImplementation: MPIImplementationOpenMPI,

pkg/apis/kubeflow/v2beta1/openapi_generated.go

Lines changed: 115 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/kubeflow/v2beta1/swagger.json

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@
260260
"runPolicy": {
261261
"description": "RunPolicy encapsulates various runtime policies of the job.",
262262
"default": {},
263-
"$ref": "#/definitions/v1.RunPolicy"
263+
"$ref": "#/definitions/v2beta1.RunPolicy"
264264
},
265265
"slotsPerWorker": {
266266
"description": "Specifies the number of slots per worker used in hostfile. Defaults to 1.",
@@ -272,6 +272,62 @@
272272
"type": "string"
273273
}
274274
}
275+
},
276+
"v2beta1.RunPolicy": {
277+
"description": "RunPolicy encapsulates various runtime policies of the distributed training job, for example how to clean up resources and how long the job can stay active.",
278+
"type": "object",
279+
"properties": {
280+
"activeDeadlineSeconds": {
281+
"description": "Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer.",
282+
"type": "integer",
283+
"format": "int64"
284+
},
285+
"backoffLimit": {
286+
"description": "Optional number of retries before marking this job failed.",
287+
"type": "integer",
288+
"format": "int32"
289+
},
290+
"cleanPodPolicy": {
291+
"description": "CleanPodPolicy defines the policy to kill pods after the job completes. Default to None.",
292+
"type": "string"
293+
},
294+
"schedulingPolicy": {
295+
"description": "SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling",
296+
"$ref": "#/definitions/v2beta1.SchedulingPolicy"
297+
},
298+
"ttlSecondsAfterFinished": {
299+
"description": "TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite.",
300+
"type": "integer",
301+
"format": "int32"
302+
}
303+
}
304+
},
305+
"v2beta1.SchedulingPolicy": {
306+
"description": "SchedulingPolicy encapsulates various scheduling policies of the distributed training job, for example `minAvailable` for gang-scheduling.",
307+
"type": "object",
308+
"properties": {
309+
"minAvailable": {
310+
"type": "integer",
311+
"format": "int32"
312+
},
313+
"minResources": {
314+
"type": "object",
315+
"additionalProperties": {
316+
"default": {},
317+
"$ref": "#/definitions/resource.Quantity"
318+
}
319+
},
320+
"priorityClass": {
321+
"type": "string"
322+
},
323+
"queue": {
324+
"type": "string"
325+
},
326+
"scheduleTimeoutSeconds": {
327+
"type": "integer",
328+
"format": "int32"
329+
}
330+
}
275331
}
276332
}
277333
}

pkg/apis/kubeflow/v2beta1/types.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package v2beta1
1616

1717
import (
1818
common "github.com/kubeflow/common/pkg/apis/common/v1"
19+
v1 "k8s.io/api/core/v1"
1920
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2021
)
2122

@@ -37,6 +38,54 @@ type MPIJobList struct {
3738
Items []MPIJob `json:"items"`
3839
}
3940

41+
// CleanPodPolicy describes how to deal with pods when the job is finished.
42+
type CleanPodPolicy string
43+
44+
const (
45+
CleanPodPolicyUndefined CleanPodPolicy = ""
46+
CleanPodPolicyAll CleanPodPolicy = "All"
47+
CleanPodPolicyRunning CleanPodPolicy = "Running"
48+
CleanPodPolicyNone CleanPodPolicy = "None"
49+
)
50+
51+
// SchedulingPolicy encapsulates various scheduling policies of the distributed training
52+
// job, for example `minAvailable` for gang-scheduling.
53+
type SchedulingPolicy struct {
54+
MinAvailable *int32 `json:"minAvailable,omitempty"`
55+
Queue string `json:"queue,omitempty"`
56+
MinResources *v1.ResourceList `json:"minResources,omitempty"`
57+
PriorityClass string `json:"priorityClass,omitempty"`
58+
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
59+
}
60+
61+
// RunPolicy encapsulates various runtime policies of the distributed training
62+
// job, for example how to clean up resources and how long the job can stay
63+
// active.
64+
type RunPolicy struct {
65+
// CleanPodPolicy defines the policy to kill pods after the job completes.
66+
// Default to None.
67+
CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
68+
69+
// TTLSecondsAfterFinished is the TTL to clean up jobs.
70+
// It may take extra ReconcilePeriod seconds for the cleanup, since
71+
// reconcile gets called periodically.
72+
// Default to infinite.
73+
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
74+
75+
// Specifies the duration in seconds relative to the startTime that the job may be active
76+
// before the system tries to terminate it; value must be positive integer.
77+
// +optional
78+
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
79+
80+
// Optional number of retries before marking this job failed.
81+
// +optional
82+
BackoffLimit *int32 `json:"backoffLimit,omitempty"`
83+
84+
// SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling
85+
// +optional
86+
SchedulingPolicy *SchedulingPolicy `json:"schedulingPolicy,omitempty"`
87+
}
88+
4089
type MPIJobSpec struct {
4190

4291
// Specifies the number of slots per worker used in hostfile.
@@ -46,7 +95,7 @@ type MPIJobSpec struct {
4695
SlotsPerWorker *int32 `json:"slotsPerWorker,omitempty"`
4796

4897
// RunPolicy encapsulates various runtime policies of the job.
49-
RunPolicy common.RunPolicy `json:"runPolicy,omitempty"`
98+
RunPolicy RunPolicy `json:"runPolicy,omitempty"`
5099

51100
// MPIReplicaSpecs contains maps from `MPIReplicaType` to `ReplicaSpec` that
52101
// specify the MPI replicas to run.

0 commit comments

Comments
 (0)