Skip to content

Commit 4c8b4fc

Browse files
authored
Use local copy of JobStatus by mpi-operator (#514)
* Use local copy of JobStatus by mpi-operator Signed-off-by: Yuki Iwai <[email protected]> * address comments Signed-off-by: Yuki Iwai <[email protected]> --------- Signed-off-by: Yuki Iwai <[email protected]>
1 parent 0b32af3 commit 4c8b4fc

23 files changed

+1410
-116
lines changed

crd/kubeflow.org_mpijobs.yaml

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7898,7 +7898,7 @@ spec:
78987898
format: date-time
78997899
type: string
79007900
conditions:
7901-
description: Conditions is an array of current observed job conditions.
7901+
description: conditions is a list of current observed job conditions.
79027902
items:
79037903
description: JobCondition describes the state of the job at a certain
79047904
point.
@@ -7913,23 +7913,30 @@ spec:
79137913
format: date-time
79147914
type: string
79157915
message:
7916-
description: A human readable message indicating details about
7916+
description: A human-readable message indicating details about
79177917
the transition.
79187918
type: string
79197919
reason:
79207920
description: The reason for the condition's last transition.
79217921
type: string
79227922
status:
7923-
description: Status of the condition, one of True, False, Unknown.
7923+
description: status of the condition, one of True, False, Unknown.
7924+
enum:
7925+
- "True"
7926+
- "False"
7927+
- Unknown
79247928
type: string
79257929
type:
7926-
description: Type of job condition.
7930+
description: type of job condition.
79277931
type: string
79287932
required:
79297933
- status
79307934
- type
79317935
type: object
79327936
type: array
7937+
x-kubernetes-list-map-keys:
7938+
- type
7939+
x-kubernetes-list-type: map
79337940
lastReconcileTime:
79347941
description: Represents last time when the job was reconciled. It
79357942
is not guaranteed to be set in happens-before order across separate
@@ -7946,11 +7953,11 @@ spec:
79467953
format: int32
79477954
type: integer
79487955
failed:
7949-
description: The number of pods which reached phase Failed.
7956+
description: The number of pods which reached phase failed.
79507957
format: int32
79517958
type: integer
79527959
labelSelector:
7953-
description: 'Deprecated: Use Selector instead'
7960+
description: 'Deprecated: Use selector instead'
79547961
properties:
79557962
matchExpressions:
79567963
description: matchExpressions is a list of label selector
@@ -7995,17 +8002,17 @@ spec:
79958002
type: object
79968003
x-kubernetes-map-type: atomic
79978004
selector:
7998-
description: A Selector is a label query over a set of resources.
8005+
description: A selector is a label query over a set of resources.
79998006
The result of matchLabels and matchExpressions are ANDed.
8000-
An empty Selector matches all objects. A null Selector matches
8007+
An empty selector matches all objects. A null selector matches
80018008
no objects.
80028009
type: string
80038010
succeeded:
8004-
description: The number of pods which reached phase Succeeded.
8011+
description: The number of pods which reached phase succeeded.
80058012
format: int32
80068013
type: integer
80078014
type: object
8008-
description: ReplicaStatuses is map of ReplicaType and ReplicaStatus,
8015+
description: replicaStatuses is map of ReplicaType and ReplicaStatus,
80098016
specifies the status of each replica.
80108017
type: object
80118018
startTime:
@@ -8015,9 +8022,6 @@ spec:
80158022
and is in UTC.
80168023
format: date-time
80178024
type: string
8018-
required:
8019-
- conditions
8020-
- replicaStatuses
80218025
type: object
80228026
type: object
80238027
served: true

pkg/apis/kubeflow/v2beta1/swagger.json

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,81 @@
187187
}
188188
}
189189
},
190+
"v2beta1.JobCondition": {
191+
"description": "JobCondition describes the state of the job at a certain point.",
192+
"type": "object",
193+
"required": [
194+
"type",
195+
"status"
196+
],
197+
"properties": {
198+
"lastTransitionTime": {
199+
"description": "Last time the condition transitioned from one status to another.",
200+
"default": {},
201+
"$ref": "#/definitions/v1.Time"
202+
},
203+
"lastUpdateTime": {
204+
"description": "The last time this condition was updated.",
205+
"default": {},
206+
"$ref": "#/definitions/v1.Time"
207+
},
208+
"message": {
209+
"description": "A human-readable message indicating details about the transition.",
210+
"type": "string"
211+
},
212+
"reason": {
213+
"description": "The reason for the condition's last transition.",
214+
"type": "string"
215+
},
216+
"status": {
217+
"description": "status of the condition, one of True, False, Unknown.",
218+
"type": "string",
219+
"default": ""
220+
},
221+
"type": {
222+
"description": "type of job condition.",
223+
"type": "string",
224+
"default": ""
225+
}
226+
}
227+
},
228+
"v2beta1.JobStatus": {
229+
"description": "JobStatus represents the current observed state of the training Job.",
230+
"type": "object",
231+
"properties": {
232+
"completionTime": {
233+
"description": "Represents time when the job was completed. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.",
234+
"$ref": "#/definitions/v1.Time"
235+
},
236+
"conditions": {
237+
"description": "conditions is a list of current observed job conditions.",
238+
"type": "array",
239+
"items": {
240+
"default": {},
241+
"$ref": "#/definitions/v2beta1.JobCondition"
242+
},
243+
"x-kubernetes-list-map-keys": [
244+
"type"
245+
],
246+
"x-kubernetes-list-type": "map"
247+
},
248+
"lastReconcileTime": {
249+
"description": "Represents last time when the job was reconciled. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.",
250+
"$ref": "#/definitions/v1.Time"
251+
},
252+
"replicaStatuses": {
253+
"description": "replicaStatuses is map of ReplicaType and ReplicaStatus, specifies the status of each replica.",
254+
"type": "object",
255+
"additionalProperties": {
256+
"$ref": "#/definitions/v2beta1.ReplicaStatus"
257+
}
258+
},
259+
"startTime": {
260+
"description": "Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.",
261+
"$ref": "#/definitions/v1.Time"
262+
}
263+
}
264+
},
190265
"v2beta1.MPIJob": {
191266
"type": "object",
192267
"properties": {
@@ -208,7 +283,7 @@
208283
},
209284
"status": {
210285
"default": {},
211-
"$ref": "#/definitions/v1.JobStatus"
286+
"$ref": "#/definitions/v2beta1.JobStatus"
212287
}
213288
}
214289
},
@@ -273,6 +348,35 @@
273348
}
274349
}
275350
},
351+
"v2beta1.ReplicaStatus": {
352+
"description": "ReplicaStatus represents the current observed state of the replica.",
353+
"type": "object",
354+
"properties": {
355+
"active": {
356+
"description": "The number of actively running pods.",
357+
"type": "integer",
358+
"format": "int32"
359+
},
360+
"failed": {
361+
"description": "The number of pods which reached phase failed.",
362+
"type": "integer",
363+
"format": "int32"
364+
},
365+
"labelSelector": {
366+
"description": "Deprecated: Use selector instead",
367+
"$ref": "#/definitions/v1.LabelSelector"
368+
},
369+
"selector": {
370+
"description": "A selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty selector matches all objects. A null selector matches no objects.",
371+
"type": "string"
372+
},
373+
"succeeded": {
374+
"description": "The number of pods which reached phase succeeded.",
375+
"type": "integer",
376+
"format": "int32"
377+
}
378+
}
379+
},
276380
"v2beta1.RunPolicy": {
277381
"description": "RunPolicy encapsulates various runtime policies of the distributed training job, for example how to clean up resources and how long the job can stay active.",
278382
"type": "object",

pkg/apis/kubeflow/v2beta1/types.go

Lines changed: 117 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ import (
2626
type MPIJob struct {
2727
metav1.TypeMeta `json:",inline"`
2828
metav1.ObjectMeta `json:"metadata,omitempty"`
29-
Spec MPIJobSpec `json:"spec,omitempty"`
30-
Status common.JobStatus `json:"status,omitempty"`
29+
Spec MPIJobSpec `json:"spec,omitempty"`
30+
Status JobStatus `json:"status,omitempty"`
3131
}
3232

3333
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -114,7 +114,7 @@ type MPIJobSpec struct {
114114
}
115115

116116
// MPIReplicaType is the type for MPIReplica.
117-
type MPIReplicaType common.ReplicaType
117+
type MPIReplicaType string
118118

119119
const (
120120
// MPIReplicaTypeLauncher is the type for launcher replica.
@@ -130,3 +130,117 @@ const (
130130
MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
131131
MPIImplementationIntel MPIImplementation = "Intel"
132132
)
133+
134+
// JobStatus represents the current observed state of the training Job.
135+
type JobStatus struct {
136+
// conditions is a list of current observed job conditions.
137+
// +optional
138+
// +listType=map
139+
// +listMapKey=type
140+
Conditions []JobCondition `json:"conditions,omitempty"`
141+
142+
// replicaStatuses is map of ReplicaType and ReplicaStatus,
143+
// specifies the status of each replica.
144+
// +optional
145+
ReplicaStatuses map[MPIReplicaType]*ReplicaStatus `json:"replicaStatuses,omitempty"`
146+
147+
// Represents time when the job was acknowledged by the job controller.
148+
// It is not guaranteed to be set in happens-before order across separate operations.
149+
// It is represented in RFC3339 form and is in UTC.
150+
// +optional
151+
StartTime *metav1.Time `json:"startTime,omitempty"`
152+
153+
// Represents time when the job was completed. It is not guaranteed to
154+
// be set in happens-before order across separate operations.
155+
// It is represented in RFC3339 form and is in UTC.
156+
// +optional
157+
CompletionTime *metav1.Time `json:"completionTime,omitempty"`
158+
159+
// Represents last time when the job was reconciled. It is not guaranteed to
160+
// be set in happens-before order across separate operations.
161+
// It is represented in RFC3339 form and is in UTC.
162+
// +optional
163+
LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"`
164+
}
165+
166+
// ReplicaStatus represents the current observed state of the replica.
167+
type ReplicaStatus struct {
168+
// The number of actively running pods.
169+
// +optional
170+
Active int32 `json:"active,omitempty"`
171+
172+
// The number of pods which reached phase succeeded.
173+
// +optional
174+
Succeeded int32 `json:"succeeded,omitempty"`
175+
176+
// The number of pods which reached phase failed.
177+
// +optional
178+
Failed int32 `json:"failed,omitempty"`
179+
180+
// Deprecated: Use selector instead
181+
// +optional
182+
LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"`
183+
184+
// A selector is a label query over a set of resources. The result of matchLabels and
185+
// matchExpressions are ANDed. An empty selector matches all objects. A null
186+
// selector matches no objects.
187+
// +optional
188+
Selector string `json:"selector,omitempty"`
189+
}
190+
191+
// JobCondition describes the state of the job at a certain point.
192+
type JobCondition struct {
193+
// type of job condition.
194+
Type JobConditionType `json:"type"`
195+
196+
// status of the condition, one of True, False, Unknown.
197+
// +kubebuilder:validation:Enum:=True;False;Unknown
198+
Status v1.ConditionStatus `json:"status"`
199+
200+
// The reason for the condition's last transition.
201+
// +optional
202+
Reason string `json:"reason,omitempty"`
203+
204+
// A human-readable message indicating details about the transition.
205+
// +optional
206+
Message string `json:"message,omitempty"`
207+
208+
// The last time this condition was updated.
209+
// +optional
210+
LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"`
211+
212+
// Last time the condition transitioned from one status to another.
213+
// +optional
214+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
215+
}
216+
217+
// JobConditionType defines all possible types of JobCondition.
218+
type JobConditionType string
219+
220+
const (
221+
// JobCreated means the job has been accepted by the system,
222+
// but one or more of the pods/services has not been started.
223+
// This includes the time before pods are scheduled and launched.
224+
JobCreated JobConditionType = "Created"
225+
226+
// JobRunning means all sub-resources (e.g. services/pods) of this job
227+
// have been successfully scheduled and launched.
228+
// The training is running without error.
229+
JobRunning JobConditionType = "Running"
230+
231+
// JobRestarting means one or more sub-resources (e.g. services/pods) of this job
232+
// reached phase failed but maybe restarted according to it's restart policy
233+
// which specified by user in v1.PodTemplateSpec.
234+
// The training is freezing/pending.
235+
JobRestarting JobConditionType = "Restarting"
236+
237+
// JobSucceeded means all sub-resources (e.g. services/pods) of this job
238+
// have terminated in success.
239+
// The training is complete without error.
240+
JobSucceeded JobConditionType = "Succeeded"
241+
242+
// JobFailed means one or more sub-resources (e.g. services/pods) of this job
243+
// reached phase failed with no restarting.
244+
// The training has failed its execution.
245+
JobFailed JobConditionType = "Failed"
246+
)

0 commit comments

Comments
 (0)