
Commit ccd0b25

refactor
Signed-off-by: Britania Rodriguez Reyes <[email protected]>
1 parent 246e2f9 commit ccd0b25

1 file changed
pkg/controllers/updaterun/execution.go

Lines changed: 174 additions & 93 deletions
@@ -107,6 +107,7 @@ func (r *Reconciler) executeUpdatingStage(
 resourceIndex, _ := strconv.Atoi(updateRunStatus.ResourceSnapshotIndexUsed)
 resourceSnapshotName := fmt.Sprintf(placementv1beta1.ResourceSnapshotNameFmt, updateRunSpec.PlacementName, resourceIndex)
 updateRunRef := klog.KObj(updateRun)
+
 // Create the map of the toBeUpdatedBindings.
 toBeUpdatedBindingsMap := make(map[string]placementv1beta1.BindingObj, len(toBeUpdatedBindings))
 for _, binding := range toBeUpdatedBindings {
@@ -118,101 +119,47 @@ func (r *Reconciler) executeUpdatingStage(
 clusterUpdatingCount := 0
 var stuckClusterNames []string
 var clusterUpdateErrors []error
+
 // Go through each cluster in the stage and check if it's updating/succeeded/failed.
 for i := 0; i < len(updatingStageStatus.Clusters) && clusterUpdatingCount < maxConcurrency; i++ {
 clusterStatus := &updatingStageStatus.Clusters[i]
-clusterUpdateSucceededCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded))
-if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionTrue {
-// The cluster has been updated successfully.
+
+// Process cluster status to determine if we should skip or handle errors
+processResult := r.processClusterStatus(clusterStatus, updatingStageStatus, updateRunRef)
+if processResult.skip {
 finishedClusterCount++
 continue
 }
-clusterUpdatingCount++
-if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionFalse {
-// The cluster is marked as failed to update, this cluster is counted as updating cluster since it's not finished to avoid processing more clusters than maxConcurrency in this round.
-failedErr := fmt.Errorf("the cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName)
-klog.ErrorS(failedErr, "The cluster has failed to be updated", "updateRun", updateRunRef)
-clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()))
+if processResult.err != nil {
+clusterUpdatingCount++
+clusterUpdateErrors = append(clusterUpdateErrors, processResult.err)
 continue
 }
-// The cluster needs to be processed.
-clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted))
+
+clusterUpdatingCount++
 binding := toBeUpdatedBindingsMap[clusterStatus.ClusterName]
+clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted))
+
 if clusterStartedCond == nil || clusterStartedCond.Status == metav1.ConditionFalse {
 // The cluster has not started updating yet.
-if !isBindingSyncedWithClusterStatus(resourceSnapshotName, updateRun, binding, clusterStatus) {
-klog.V(2).InfoS("Found the first cluster that needs to be updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-// The binding is not up-to-date with the cluster status.
-bindingSpec := binding.GetBindingSpec()
-bindingSpec.State = placementv1beta1.BindingStateBound
-bindingSpec.ResourceSnapshotName = resourceSnapshotName
-bindingSpec.ResourceOverrideSnapshots = clusterStatus.ResourceOverrideSnapshots
-bindingSpec.ClusterResourceOverrideSnapshots = clusterStatus.ClusterResourceOverrideSnapshots
-bindingSpec.ApplyStrategy = updateRunStatus.ApplyStrategy
-if err := r.Client.Update(ctx, binding); err != nil {
-klog.ErrorS(err, "Failed to update binding to be bound with the matching spec of the updateRun", "binding", klog.KObj(binding), "updateRun", updateRunRef)
-clusterUpdateErrors = append(clusterUpdateErrors, controller.NewUpdateIgnoreConflictError(err))
-continue
-}
-klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil {
-clusterUpdateErrors = append(clusterUpdateErrors, err)
-continue
-}
-} else {
-klog.V(2).InfoS("Found the first binding that is updating but the cluster status has not been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-bindingSpec := binding.GetBindingSpec()
-if bindingSpec.State != placementv1beta1.BindingStateBound {
-bindingSpec.State = placementv1beta1.BindingStateBound
-if err := r.Client.Update(ctx, binding); err != nil {
-klog.ErrorS(err, "Failed to update a binding to be bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-clusterUpdateErrors = append(clusterUpdateErrors, controller.NewUpdateIgnoreConflictError(err))
-continue
-}
-klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil {
-clusterUpdateErrors = append(clusterUpdateErrors, err)
-continue
-}
-} else if !condition.IsConditionStatusTrue(meta.FindStatusCondition(binding.GetBindingStatus().Conditions, string(placementv1beta1.ResourceBindingRolloutStarted)), binding.GetGeneration()) {
-klog.V(2).InfoS("The binding is bound and up-to-date but the generation is updated by the scheduler, update rolloutStarted status again", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil {
-clusterUpdateErrors = append(clusterUpdateErrors, err)
-continue
-}
-} else {
-if _, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun); updateErr != nil {
-clusterUpdateErrors = append(clusterUpdateErrors, updateErr)
-continue
-}
-}
+if err := r.updateBindingForCluster(ctx, binding, clusterStatus, resourceSnapshotName, updateRun, updatingStageStatus); err != nil {
+clusterUpdateErrors = append(clusterUpdateErrors, err)
+continue
 }
 markClusterUpdatingStarted(clusterStatus, updateRun.GetGeneration())
 if finishedClusterCount == 0 {
 markStageUpdatingStarted(updatingStageStatus, updateRun.GetGeneration())
 }
-// Need to continue as we need to process at most maxConcurrency number of clusters in parallel.
 continue
 }

-// Now the cluster has to be updating, the binding should point to the right resource snapshot and the binding should be bound.
-inSync := isBindingSyncedWithClusterStatus(resourceSnapshotName, updateRun, binding, clusterStatus)
-rolloutStarted := condition.IsConditionStatusTrue(meta.FindStatusCondition(binding.GetBindingStatus().Conditions, string(placementv1beta1.ResourceBindingRolloutStarted)), binding.GetGeneration())
-bindingSpec := binding.GetBindingSpec()
-if !inSync || !rolloutStarted || bindingSpec.State != placementv1beta1.BindingStateBound {
-// This issue mostly happens when there are concurrent updateRuns referencing the same clusterResourcePlacement but releasing different versions.
-// After the 1st updateRun updates the binding, and before the controller re-checks the binding status, the 2nd updateRun updates the same binding, and thus the 1st updateRun is preempted and observes the binding not matching the desired state.
-preemptedErr := controller.NewUserError(fmt.Errorf("the binding of the updating cluster `%s` in the stage `%s` is not up-to-date with the desired status, "+
-"please check the status of binding `%s` and see if there is a concurrent updateRun referencing the same clusterResourcePlacement and updating the same cluster",
-clusterStatus.ClusterName, updatingStageStatus.StageName, klog.KObj(binding)))
-klog.ErrorS(preemptedErr, "The binding has been changed during updating",
-"bindingSpecInSync", inSync, "bindingState", bindingSpec.State,
-"bindingRolloutStarted", rolloutStarted, "binding", klog.KObj(binding), "updateRun", updateRunRef)
-markClusterUpdatingFailed(clusterStatus, updateRun.GetGeneration(), preemptedErr.Error())
-clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %s", errStagedUpdatedAborted, preemptedErr.Error()))
+// The cluster is already updating - validate it's properly synchronized
+if err := r.validateUpdatingCluster(binding, clusterStatus, resourceSnapshotName, updateRun, updatingStageStatus); err != nil {
+clusterUpdateErrors = append(clusterUpdateErrors, err)
 continue
 }

+// Check if the cluster update has finished
 finished, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun)
 if updateErr != nil {
 clusterUpdateErrors = append(clusterUpdateErrors, updateErr)
@@ -222,7 +169,7 @@ func (r *Reconciler) executeUpdatingStage(
 // The cluster has finished successfully, we can process another cluster in this round.
 clusterUpdatingCount--
 } else {
-// If cluster update has been running for more than "updateRunStuckThreshold", mark the update run as stuck.
+// Check if cluster is stuck
 timeElapsed := time.Since(clusterStartedCond.LastTransitionTime.Time)
 if timeElapsed > updateRunStuckThreshold {
 klog.V(2).InfoS("Time waiting for cluster update to finish passes threshold, mark the update run as stuck", "time elapsed", timeElapsed, "threshold", updateRunStuckThreshold, "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
@@ -241,29 +188,163 @@ func (r *Reconciler) executeUpdatingStage(
 }

 if finishedClusterCount == len(updatingStageStatus.Clusters) {
-// All the clusters in the stage have been updated.
-markUpdateRunWaiting(updateRun, updatingStageStatus.StageName)
-markStageUpdatingWaiting(updatingStageStatus, updateRun.GetGeneration())
-klog.V(2).InfoS("The stage has finished all cluster updating", "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
-// Check if the after stage tasks are ready.
-approved, waitTime, err := r.checkAfterStageTasksStatus(ctx, updatingStageIndex, updateRun)
-if err != nil {
-return 0, err
+return r.handleStageCompletion(ctx, updatingStageIndex, updateRun, updatingStageStatus)
+}
+
+// Some clusters are still updating.
+return clusterUpdatingWaitTime, nil
+}
+
+// clusterProcessResult represents the result of processing a cluster.
+type clusterProcessResult struct {
+finished bool
+skip bool // true if the cluster should be skipped (already processed)
+err error
+}
+
+// processClusterStatus evaluates the status of a cluster and determines if it's finished, failed, or needs processing.
+// Returns a clusterProcessResult indicating how to proceed with this cluster.
+func (r *Reconciler) processClusterStatus(
+clusterStatus *placementv1beta1.ClusterUpdatingStatus,
+updatingStageStatus *placementv1beta1.StageUpdatingStatus,
+updateRunRef klog.ObjectRef,
+) clusterProcessResult {
+clusterUpdateSucceededCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded))
+
+if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionTrue {
+// The cluster has been updated successfully.
+return clusterProcessResult{finished: true, skip: true}
+}
+
+if clusterUpdateSucceededCond != nil && clusterUpdateSucceededCond.Status == metav1.ConditionFalse {
+// The cluster is marked as failed to update, this cluster is counted as updating cluster
+// since it's not finished to avoid processing more clusters than maxConcurrency in this round.
+failedErr := fmt.Errorf("the cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName)
+klog.ErrorS(failedErr, "The cluster has failed to be updated", "updateRun", updateRunRef)
+return clusterProcessResult{
+err: fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()),
 }
-if approved {
-markUpdateRunProgressing(updateRun)
-markStageUpdatingSucceeded(updatingStageStatus, updateRun.GetGeneration())
-// No need to wait to get to the next stage.
-return 0, nil
+}
+
+// The cluster needs to be processed.
+return clusterProcessResult{}
+}
+
+// updateBindingForCluster handles updating the binding for a cluster that hasn't started updating yet.
+// Returns an error if the binding update fails.
+func (r *Reconciler) updateBindingForCluster(
+ctx context.Context,
+binding placementv1beta1.BindingObj,
+clusterStatus *placementv1beta1.ClusterUpdatingStatus,
+resourceSnapshotName string,
+updateRun placementv1beta1.UpdateRunObj,
+updatingStageStatus *placementv1beta1.StageUpdatingStatus,
+) error {
+updateRunRef := klog.KObj(updateRun)
+updateRunStatus := updateRun.GetUpdateRunStatus()
+
+if !isBindingSyncedWithClusterStatus(resourceSnapshotName, updateRun, binding, clusterStatus) {
+klog.V(2).InfoS("Found the first cluster that needs to be updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+// The binding is not up-to-date with the cluster status.
+bindingSpec := binding.GetBindingSpec()
+bindingSpec.State = placementv1beta1.BindingStateBound
+bindingSpec.ResourceSnapshotName = resourceSnapshotName
+bindingSpec.ResourceOverrideSnapshots = clusterStatus.ResourceOverrideSnapshots
+bindingSpec.ClusterResourceOverrideSnapshots = clusterStatus.ClusterResourceOverrideSnapshots
+bindingSpec.ApplyStrategy = updateRunStatus.ApplyStrategy
+if err := r.Client.Update(ctx, binding); err != nil {
+klog.ErrorS(err, "Failed to update binding to be bound with the matching spec of the updateRun", "binding", klog.KObj(binding), "updateRun", updateRunRef)
+return controller.NewUpdateIgnoreConflictError(err)
+}
+klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+return r.updateBindingRolloutStarted(ctx, binding, updateRun)
+}
+
+// The binding is synced but needs other updates
+klog.V(2).InfoS("Found the first binding that is updating but the cluster status has not been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+bindingSpec := binding.GetBindingSpec()
+if bindingSpec.State != placementv1beta1.BindingStateBound {
+bindingSpec.State = placementv1beta1.BindingStateBound
+if err := r.Client.Update(ctx, binding); err != nil {
+klog.ErrorS(err, "Failed to update a binding to be bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+return controller.NewUpdateIgnoreConflictError(err)
 }
-// The after stage tasks are not ready yet.
-if waitTime < 0 {
-waitTime = stageUpdatingWaitTime
+klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+return r.updateBindingRolloutStarted(ctx, binding, updateRun)
+} else if !condition.IsConditionStatusTrue(meta.FindStatusCondition(binding.GetBindingStatus().Conditions, string(placementv1beta1.ResourceBindingRolloutStarted)), binding.GetGeneration()) {
+klog.V(2).InfoS("The binding is bound and up-to-date but the generation is updated by the scheduler, update rolloutStarted status again", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+return r.updateBindingRolloutStarted(ctx, binding, updateRun)
+} else {
+if _, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun); updateErr != nil {
+return updateErr
 }
-return waitTime, nil
 }
-// Some clusters are still updating.
-return clusterUpdatingWaitTime, nil
+return nil
+}
+
+// validateUpdatingCluster validates that an updating cluster's binding is properly synchronized.
+// Returns an error if validation fails (indicating a concurrent update conflict).
+func (r *Reconciler) validateUpdatingCluster(
+binding placementv1beta1.BindingObj,
+clusterStatus *placementv1beta1.ClusterUpdatingStatus,
+resourceSnapshotName string,
+updateRun placementv1beta1.UpdateRunObj,
+updatingStageStatus *placementv1beta1.StageUpdatingStatus,
+) error {
+updateRunRef := klog.KObj(updateRun)
+
+// Now the cluster has to be updating, the binding should point to the right resource snapshot and the binding should be bound.
+inSync := isBindingSyncedWithClusterStatus(resourceSnapshotName, updateRun, binding, clusterStatus)
+rolloutStarted := condition.IsConditionStatusTrue(meta.FindStatusCondition(binding.GetBindingStatus().Conditions, string(placementv1beta1.ResourceBindingRolloutStarted)), binding.GetGeneration())
+bindingSpec := binding.GetBindingSpec()
+
+if !inSync || !rolloutStarted || bindingSpec.State != placementv1beta1.BindingStateBound {
+// This issue mostly happens when there are concurrent updateRuns referencing the same clusterResourcePlacement but releasing different versions.
+// After the 1st updateRun updates the binding, and before the controller re-checks the binding status, the 2nd updateRun updates the same binding, and thus the 1st updateRun is preempted and observes the binding not matching the desired state.
+preemptedErr := controller.NewUserError(fmt.Errorf("the binding of the updating cluster `%s` in the stage `%s` is not up-to-date with the desired status, "+
+"please check the status of binding `%s` and see if there is a concurrent updateRun referencing the same clusterResourcePlacement and updating the same cluster",
+clusterStatus.ClusterName, updatingStageStatus.StageName, klog.KObj(binding)))
+klog.ErrorS(preemptedErr, "The binding has been changed during updating",
+"bindingSpecInSync", inSync, "bindingState", bindingSpec.State,
+"bindingRolloutStarted", rolloutStarted, "binding", klog.KObj(binding), "updateRun", updateRunRef)
+markClusterUpdatingFailed(clusterStatus, updateRun.GetGeneration(), preemptedErr.Error())
+return fmt.Errorf("%w: %s", errStagedUpdatedAborted, preemptedErr.Error())
+}
+
+return nil
+}
+
+// handleStageCompletion handles the completion logic when all clusters in a stage are finished.
+// Returns the wait time and any error encountered.
+func (r *Reconciler) handleStageCompletion(
+ctx context.Context,
+updatingStageIndex int,
+updateRun placementv1beta1.UpdateRunObj,
+updatingStageStatus *placementv1beta1.StageUpdatingStatus,
+) (time.Duration, error) {
+updateRunRef := klog.KObj(updateRun)
+
+// All the clusters in the stage have been updated.
+markUpdateRunWaiting(updateRun, updatingStageStatus.StageName)
+markStageUpdatingWaiting(updatingStageStatus, updateRun.GetGeneration())
+klog.V(2).InfoS("The stage has finished all cluster updating", "stage", updatingStageStatus.StageName, "updateRun", updateRunRef)
+
+// Check if the after stage tasks are ready.
+approved, waitTime, err := r.checkAfterStageTasksStatus(ctx, updatingStageIndex, updateRun)
+if err != nil {
+return 0, err
+}
+if approved {
+markUpdateRunProgressing(updateRun)
+markStageUpdatingSucceeded(updatingStageStatus, updateRun.GetGeneration())
+// No need to wait to get to the next stage.
+return 0, nil
+}
+// The after stage tasks are not ready yet.
+if waitTime < 0 {
+waitTime = stageUpdatingWaitTime
+}
+return waitTime, nil
 }

 // executeDeleteStage executes the delete stage by deleting the bindings.
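For illustration only: the sketch below is a minimal, self-contained Go program that mimics the per-cluster decision flow this commit extracts into processClusterStatus and clusterProcessResult (skip a cluster that already succeeded, surface an error for one that already failed, otherwise keep processing it). The types and names here (condStatus, clusterStatus, processResult, errStageAborted, the member-N cluster names) are simplified hypothetical stand-ins, not the real placementv1beta1 or metav1 API; only the branching mirrors the diff.

package main

import (
	"errors"
	"fmt"
)

// condStatus is a hypothetical stand-in for metav1.ConditionStatus.
type condStatus string

const (
	condTrue  condStatus = "True"
	condFalse condStatus = "False"
)

// clusterStatus is a trimmed-down stand-in for the cluster updating status;
// succeeded == nil means the Succeeded condition has not been set yet.
type clusterStatus struct {
	name      string
	succeeded *condStatus
}

// processResult mirrors the shape of the commit's clusterProcessResult.
type processResult struct {
	finished bool
	skip     bool
	err      error
}

var errStageAborted = errors.New("staged update aborted")

// processClusterStatus reproduces the three-way branching shown in the diff.
func processClusterStatus(c clusterStatus, stage string) processResult {
	if c.succeeded != nil && *c.succeeded == condTrue {
		// Already updated successfully: count it as finished and skip it.
		return processResult{finished: true, skip: true}
	}
	if c.succeeded != nil && *c.succeeded == condFalse {
		// Already failed: report an error so the caller can abort the stage.
		return processResult{err: fmt.Errorf("%w: cluster %q in stage %q has failed", errStageAborted, c.name, stage)}
	}
	// No terminal condition yet: the caller still needs to drive this cluster.
	return processResult{}
}

func main() {
	t, f := condTrue, condFalse
	clusters := []clusterStatus{
		{name: "member-1", succeeded: &t},
		{name: "member-2", succeeded: &f},
		{name: "member-3"},
	}
	for _, c := range clusters {
		r := processClusterStatus(c, "staging")
		fmt.Printf("%s: skip=%t finished=%t err=%v\n", c.name, r.skip, r.finished, r.err)
	}
}

Returning a small result struct instead of mutating counters inline is what lets the refactored executeUpdatingStage loop stay short: the caller decides how skip, err, and "needs processing" translate into finishedClusterCount, clusterUpdatingCount, and clusterUpdateErrors.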
