Commit 246e2f9

Start/Stop API Implementation
Signed-off-by: Britania Rodriguez Reyes <[email protected]>
1 parent d7e9d01 commit 246e2f9

File tree: 8 files changed, +610 -74 lines


pkg/controllers/updaterun/controller.go

Lines changed: 85 additions & 21 deletions
@@ -104,11 +104,26 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
 	// Emit the update run status metric based on status conditions in the updateRun.
 	defer emitUpdateRunStatusMetric(updateRun)
 
+	// Early check for abandoned state - this is a terminal state, no initialization needed.
+	state := updateRun.GetUpdateRunSpec().State
+	if state == placementv1beta1.StateAbandoned {
+		klog.V(2).InfoS("The updateRun is abandoned, terminating", "state", state, "updateRun", runObjRef)
+		return runtime.Result{}, r.recordUpdateRunAbandoned(ctx, updateRun)
+	} else if state == placementv1beta1.StateStopped { // Early check for stopped state - pause the update run if needed.
+		klog.V(2).InfoS("The updateRun is stopped, waiting to resume", "state", state, "updateRun", runObjRef)
+		return runtime.Result{}, r.recordUpdateRunPaused(ctx, updateRun)
+	}
+
 	var updatingStageIndex int
 	var toBeUpdatedBindings, toBeDeletedBindings []placementv1beta1.BindingObj
 	updateRunStatus := updateRun.GetUpdateRunStatus()
 	initCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized))
-	if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) {
+	// Check if initialized regardless of generation.
+	// The updateRun spec fields are immutable except for the state field. When the state changes,
+	// the update run generation increments, but we don't need to reinitialize since initialization is a one-time setup.
+	isInitialized := initCond != nil && initCond.Status == metav1.ConditionTrue
+	if !isInitialized {
+		// Check if initialization failed for the current generation.
 		if condition.IsConditionStatusFalse(initCond, updateRun.GetGeneration()) {
 			klog.V(2).InfoS("The updateRun has failed to initialize", "errorMsg", initCond.Message, "updateRun", runObjRef)
 			return runtime.Result{}, nil
@@ -122,7 +137,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
 			}
 			return runtime.Result{}, initErr
 		}
-		updatingStageIndex = 0 // start from the first stage.
+		updatingStageIndex = 0 // start from the first stage (typically for NotStarted or Started states).
 		klog.V(2).InfoS("Initialized the updateRun", "updateRun", runObjRef)
 	} else {
 		klog.V(2).InfoS("The updateRun is initialized", "updateRun", runObjRef)
@@ -134,6 +149,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
 	}
 	var validateErr error
 	// Validate the updateRun status to ensure the update can be continued and get the updating stage index and cluster indices.
+	// For Stopped → Started transition, this will resume from where it left off.
 	if updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings, validateErr = r.validate(ctx, updateRun); validateErr != nil {
 		// errStagedUpdatedAborted cannot be retried.
 		if errors.Is(validateErr, errStagedUpdatedAborted) {
@@ -151,28 +167,32 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
 	}
 
 	// Execute the updateRun.
-	klog.V(2).InfoS("Continue to execute the updateRun", "updatingStageIndex", updatingStageIndex, "updateRun", runObjRef)
-	finished, waitTime, execErr := r.execute(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
-	if errors.Is(execErr, errStagedUpdatedAborted) {
-		// errStagedUpdatedAborted cannot be retried.
-		return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, execErr.Error())
-	}
+	if state == placementv1beta1.StateStarted {
+		klog.V(2).InfoS("Continue to execute the updateRun", "updatingStageIndex", updatingStageIndex, "updateRun", runObjRef)
+		finished, waitTime, execErr := r.execute(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
+		if errors.Is(execErr, errStagedUpdatedAborted) {
+			// errStagedUpdatedAborted cannot be retried.
+			return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, execErr.Error())
+		}
 
-	if finished {
-		klog.V(2).InfoS("The updateRun is completed", "updateRun", runObjRef)
-		return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, updateRun)
-	}
+		if finished {
+			klog.V(2).InfoS("The updateRun is completed", "updateRun", runObjRef)
+			return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, updateRun)
+		}
 
-	// The execution is not finished yet or it encounters a retriable error.
-	// We need to record the status and requeue.
-	if updateErr := r.recordUpdateRunStatus(ctx, updateRun); updateErr != nil {
-		return runtime.Result{}, updateErr
-	}
-	klog.V(2).InfoS("The updateRun is not finished yet", "requeueWaitTime", waitTime, "execErr", execErr, "updateRun", runObjRef)
-	if execErr != nil {
-		return runtime.Result{}, execErr
+		// The execution is not finished yet or it encounters a retriable error.
+		// We need to record the status and requeue.
+		if updateErr := r.recordUpdateRunStatus(ctx, updateRun); updateErr != nil {
+			return runtime.Result{}, updateErr
+		}
+		klog.V(2).InfoS("The updateRun is not finished yet", "requeueWaitTime", waitTime, "execErr", execErr, "updateRun", runObjRef)
+		if execErr != nil {
+			return runtime.Result{}, execErr
+		}
+		return runtime.Result{Requeue: true, RequeueAfter: waitTime}, nil
 	}
-	return runtime.Result{Requeue: true, RequeueAfter: waitTime}, nil
+	klog.V(2).InfoS("The updateRun is not started, waiting to be started", "state", state, "updateRun", runObjRef)
+	return runtime.Result{}, nil
 }
 
 // handleDelete handles the deletion of the updateRun object.
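
Note: with this change, Reconcile dispatches on spec.state. Abandoned and Stopped short-circuit before initialization, Started runs the stages, and any other state (e.g. NotStarted) simply waits. Since state is the only mutable spec field, the whole caller-side start/stop/abandon surface is one field write. A rough sketch of such a caller (not part of this commit; the package name, the module import path, and the placementv1beta1.State type name are assumptions based on the identifiers in the diff):

	package updaterunclient // hypothetical helper package

	import (
		"context"

		"sigs.k8s.io/controller-runtime/pkg/client"

		placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" // module path assumed
	)

	// SetUpdateRunState flips the one mutable spec field. StateStopped pauses
	// the run, StateStarted resumes it from the validated stage, and
	// StateAbandoned terminates it.
	func SetUpdateRunState(ctx context.Context, c client.Client, name string, s placementv1beta1.State) error {
		var run placementv1beta1.ClusterStagedUpdateRun
		if err := c.Get(ctx, client.ObjectKey{Name: name}, &run); err != nil {
			return err
		}
		run.Spec.State = s
		return c.Update(ctx, &run)
	}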
@@ -265,6 +285,50 @@ func (r *Reconciler) recordUpdateRunFailed(ctx context.Context, updateRun placem
 	return nil
 }
 
+// recordUpdateRunPaused records the progressing condition as paused in the updateRun status.
+func (r *Reconciler) recordUpdateRunPaused(ctx context.Context, updateRun placementv1beta1.UpdateRunObj) error {
+	updateRunStatus := updateRun.GetUpdateRunStatus()
+	meta.SetStatusCondition(&updateRunStatus.Conditions, metav1.Condition{
+		Type:               string(placementv1beta1.StagedUpdateRunConditionProgressing),
+		Status:             metav1.ConditionFalse,
+		ObservedGeneration: updateRun.GetGeneration(),
+		Reason:             condition.UpdateRunPausedReason,
+		Message:            "The update run is paused",
+	})
+	if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil {
+		klog.ErrorS(updateErr, "Failed to update the updateRun status as paused", "updateRun", klog.KObj(updateRun))
+		// updateErr can be retried.
+		return controller.NewUpdateIgnoreConflictError(updateErr)
+	}
+	return nil
+}
+
+// recordUpdateRunAbandoned records the succeeded and progressing conditions as abandoned in the updateRun status.
+func (r *Reconciler) recordUpdateRunAbandoned(ctx context.Context, updateRun placementv1beta1.UpdateRunObj) error {
+	updateRunStatus := updateRun.GetUpdateRunStatus()
+	meta.SetStatusCondition(&updateRunStatus.Conditions, metav1.Condition{
+		Type:               string(placementv1beta1.StagedUpdateRunConditionProgressing),
+		Status:             metav1.ConditionFalse,
+		ObservedGeneration: updateRun.GetGeneration(),
+		Reason:             condition.UpdateRunAbandonedReason,
+		Message:            "The stages are aborted due to abandonment",
+	})
+	meta.SetStatusCondition(&updateRunStatus.Conditions, metav1.Condition{
+		Type:               string(placementv1beta1.StagedUpdateRunConditionSucceeded),
+		Status:             metav1.ConditionFalse,
+		ObservedGeneration: updateRun.GetGeneration(),
+		Reason:             condition.UpdateRunAbandonedReason,
+		Message:            "The update run has been abandoned",
+	})
+
+	if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil {
+		klog.ErrorS(updateErr, "Failed to update the updateRun status as abandoned", "updateRun", klog.KObj(updateRun))
+		// updateErr can be retried.
+		return controller.NewUpdateIgnoreConflictError(updateErr)
+	}
+	return nil
+}
+
 // recordUpdateRunStatus records the updateRun status.
 func (r *Reconciler) recordUpdateRunStatus(ctx context.Context, updateRun placementv1beta1.UpdateRunObj) error {
 	if updateErr := r.Client.Status().Update(ctx, updateRun); updateErr != nil {
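
Note: the two recorders distinguish outcomes purely by condition reason (UpdateRunPausedReason vs. UpdateRunAbandonedReason), both with status False. A consumer could tell them apart with helpers like the following sketch (assumed consumer-side code, not in this commit; imports are k8s.io/apimachinery/pkg/api/meta, metav1, and the fleet condition and placementv1beta1 packages already used in this file):

	// isPaused reports whether the run was paused via the Stopped state.
	func isPaused(conds []metav1.Condition) bool {
		cond := meta.FindStatusCondition(conds, string(placementv1beta1.StagedUpdateRunConditionProgressing))
		return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == condition.UpdateRunPausedReason
	}

	// isAbandoned reports whether the run was terminated via the Abandoned state.
	func isAbandoned(conds []metav1.Condition) bool {
		cond := meta.FindStatusCondition(conds, string(placementv1beta1.StagedUpdateRunConditionSucceeded))
		return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == condition.UpdateRunAbandonedReason
	}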

pkg/controllers/updaterun/controller_integration_test.go

Lines changed: 38 additions & 16 deletions
@@ -272,6 +272,16 @@ func generateMetricsLabels(
 	}
 }
 
+func generateInitializationSucceededMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
+	return &prometheusclientmodel.Metric{
+		Label: generateMetricsLabels(updateRun, string(placementv1beta1.StagedUpdateRunConditionInitialized),
+			string(metav1.ConditionTrue), condition.UpdateRunInitializeSucceededReason),
+		Gauge: &prometheusclientmodel.Gauge{
+			Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
+		},
+	}
+}
+
 func generateInitializationFailedMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
 	return &prometheusclientmodel.Metric{
 		Label: generateMetricsLabels(updateRun, string(placementv1beta1.StagedUpdateRunConditionInitialized),
@@ -312,6 +322,26 @@ func generateStuckMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *pr
 	}
 }
 
+func generatePausedMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
+	return &prometheusclientmodel.Metric{
+		Label: generateMetricsLabels(updateRun, string(placementv1beta1.StagedUpdateRunConditionProgressing),
+			string(metav1.ConditionFalse), condition.UpdateRunPausedReason),
+		Gauge: &prometheusclientmodel.Gauge{
+			Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
+		},
+	}
+}
+
+func generateAbandonedMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
+	return &prometheusclientmodel.Metric{
+		Label: generateMetricsLabels(updateRun, string(placementv1beta1.StagedUpdateRunConditionSucceeded),
+			string(metav1.ConditionFalse), condition.UpdateRunAbandonedReason),
+		Gauge: &prometheusclientmodel.Gauge{
+			Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
+		},
+	}
+}
+
 func generateFailedMetric(updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
 	return &prometheusclientmodel.Metric{
 		Label: generateMetricsLabels(updateRun, string(placementv1beta1.StagedUpdateRunConditionSucceeded),
@@ -341,6 +371,7 @@ func generateTestClusterStagedUpdateRun() *placementv1beta1.ClusterStagedUpdateR
 			PlacementName:            testCRPName,
 			ResourceSnapshotIndex:    testResourceSnapshotIndex,
 			StagedUpdateStrategyName: testUpdateStrategyName,
+			State:                    placementv1beta1.StateStarted,
 		},
 	}
 }
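
Note: the shared fixture now defaults spec.State to Started so the pre-existing execution tests keep exercising the run-to-completion path, and the new metric helpers record the emission time in Unix seconds (UnixNano()/1e9). A test covering the new paths would presumably override the state, roughly:

	// Assumed test usage, not shown in this commit: flip the fixture's state
	// to drive the paused path and expect generatePausedMetric(updateRun).
	updateRun := generateTestClusterStagedUpdateRun()
	updateRun.Spec.State = placementv1beta1.StateStopped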
@@ -796,23 +827,14 @@ func generateFalseCondition(obj client.Object, condType any) metav1.Condition {
 	}
 }
 
-func generateFalseProgressingCondition(obj client.Object, condType any, succeeded bool) metav1.Condition {
+func generateFalseProgressingCondition(obj client.Object, condType any, reason string) metav1.Condition {
+	falseCond := generateFalseCondition(obj, condType)
+	falseCond.Reason = reason
+	return falseCond
+}
+
+func generateFalseSucceededCondition(obj client.Object, condType any, reason string) metav1.Condition {
 	falseCond := generateFalseCondition(obj, condType)
-	reason := ""
-	switch condType {
-	case placementv1beta1.StagedUpdateRunConditionProgressing:
-		if succeeded {
-			reason = condition.UpdateRunSucceededReason
-		} else {
-			reason = condition.UpdateRunFailedReason
-		}
-	case placementv1beta1.StageUpdatingConditionProgressing:
-		if succeeded {
-			reason = condition.StageUpdatingSucceededReason
-		} else {
-			reason = condition.StageUpdatingFailedReason
-		}
-	}
 	falseCond.Reason = reason
 	return falseCond
 }
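
Design note: the old helper inferred the reason from the condition type plus a succeeded flag, which cannot express the new Paused and Abandoned reasons; passing the reason explicitly scales to any number of terminal reasons without another switch arm. Call sites after the refactor would look roughly like (assumed usage, not shown in this diff):

	// Reason is now explicit at each call site.
	generateFalseProgressingCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing, condition.UpdateRunPausedReason)
	generateFalseSucceededCondition(updateRun, placementv1beta1.StagedUpdateRunConditionSucceeded, condition.UpdateRunAbandonedReason)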

pkg/controllers/updaterun/execution.go

Lines changed: 5 additions & 4 deletions
@@ -122,13 +122,13 @@ func (r *Reconciler) executeUpdatingStage(
 	for i := 0; i < len(updatingStageStatus.Clusters) && clusterUpdatingCount < maxConcurrency; i++ {
 		clusterStatus := &updatingStageStatus.Clusters[i]
 		clusterUpdateSucceededCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded))
-		if condition.IsConditionStatusTrue(clusterUpdateSucceededCond, updateRun.GetGeneration()) {
+		if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionTrue {
 			// The cluster has been updated successfully.
 			finishedClusterCount++
 			continue
 		}
 		clusterUpdatingCount++
-		if condition.IsConditionStatusFalse(clusterUpdateSucceededCond, updateRun.GetGeneration()) {
+		if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionFalse {
 			// The cluster is marked as failed to update, this cluster is counted as updating cluster since it's not finished to avoid processing more clusters than maxConcurrency in this round.
 			failedErr := fmt.Errorf("the cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName)
 			klog.ErrorS(failedErr, "The cluster has failed to be updated", "updateRun", updateRunRef)
@@ -138,7 +138,7 @@ func (r *Reconciler) executeUpdatingStage(
 		// The cluster needs to be processed.
 		clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted))
 		binding := toBeUpdatedBindingsMap[clusterStatus.ClusterName]
-		if !condition.IsConditionStatusTrue(clusterStartedCond, updateRun.GetGeneration()) {
+		if clusterStartedCond == nil || clusterStartedCond.Status == metav1.ConditionFalse {
 			// The cluster has not started updating yet.
 			if !isBindingSyncedWithClusterStatus(resourceSnapshotName, updateRun, binding, clusterStatus) {
 				klog.V(2).InfoS("Found the first cluster that needs to be updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
@@ -293,7 +293,8 @@ func (r *Reconciler) executeDeleteStage(
 		// In validation, we already check the binding must exist in the status.
 		delete(existingDeleteStageClusterMap, bindingSpec.TargetCluster)
 		// Make sure the cluster is not marked as deleted as the binding is still there.
-		if condition.IsConditionStatusTrue(meta.FindStatusCondition(curCluster.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded)), updateRun.GetGeneration()) {
+		clusterDeleteSucceededCond := meta.FindStatusCondition(curCluster.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded))
+		if clusterDeleteSucceededCond != nil && clusterDeleteSucceededCond.Status == metav1.ConditionTrue {
 			unexpectedErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("the deleted cluster `%s` in the deleting stage still has a binding", bindingSpec.TargetCluster))
 			klog.ErrorS(unexpectedErr, "The cluster in the deleting stage is not removed yet but marked as deleted", "cluster", curCluster.ClusterName, "updateRun", updateRunRef)
 			return false, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error())
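
Note: every change in execution.go drops the ObservedGeneration comparison. A stop/start transition bumps the updateRun generation, so generation-pinned checks would treat already-finished clusters as stale and re-process them on resume. The inlined nil-plus-status checks are equivalent to a small generation-agnostic helper, shown here only to make the pattern explicit (hypothetical; the commit inlines the checks rather than adding this to the condition package):

	// isCondTrue is a generation-agnostic version of condition.IsConditionStatusTrue.
	func isCondTrue(cond *metav1.Condition) bool {
		return cond != nil && cond.Status == metav1.ConditionTrue
	}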
