From 8756d7003e7ba2b80e22d990959f3bce7d62d4e0 Mon Sep 17 00:00:00 2001
From: Hongkai Liu <hongkailiu@users.noreply.github.com>
Date: Tue, 23 Sep 2025 15:32:11 -0400
Subject: [PATCH 1/2] ClusterOperators should not go Progressing only for
 cluster scaling

This is to cover the cluster scaling case from the rule [1] that was
introduced recently:

```
Operators should not report Progressing only because DaemonSets
owned by them are adjusting to a new node from cluster scaleup or
a node rebooting from cluster upgrade.
```

The test plugs into the existing scaling test. It checks each
CO's Progressing condition before and after the test, and identifies
every CO that either left Progressing=False or re-entered
Progressing=False with a different LastTransitionTime.

[1]. https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164
---
 test/extended/machines/scale.go | 40 ++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go
index 695a6ce35816..58ebde800429 100644
--- a/test/extended/machines/scale.go
+++ b/test/extended/machines/scale.go
@@ -9,6 +9,8 @@ import (
 
 	g "github.com/onsi/ginkgo/v2"
 	o "github.com/onsi/gomega"
+	configv1 "github.com/openshift/api/config/v1"
+	configclient "github.com/openshift/client-go/config/clientset/versioned"
 	bmhelper "github.com/openshift/origin/test/extended/baremetal"
 	"github.com/stretchr/objx"
 	corev1 "k8s.io/api/core/v1"
@@ -186,12 +188,28 @@ func scaleMachineSet(name string, replicas int) error {
 	return nil
 }
 
+func getOperatorsNotProgressing(c configclient.Interface) map[string]metav1.Time {
+	operators, err := c.ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
+	o.Expect(err).NotTo(o.HaveOccurred())
+	result := map[string]metav1.Time{}
+	for _, operator := range operators.Items {
+		for _, condition := range operator.Status.Conditions {
+			if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionFalse {
+				result[operator.Name] = condition.LastTransitionTime
+			}
+		}
+	}
+	return result
+}
+
 var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cluster should", func() {
 
 	var (
-		c      *kubernetes.Clientset
-		dc     dynamic.Interface
-		helper *bmhelper.BaremetalTestHelper
+		c                       *kubernetes.Clientset
+		configClient            configclient.Interface
+		dc                      dynamic.Interface
+		helper                  *bmhelper.BaremetalTestHelper
+		operatorsNotProgressing map[string]metav1.Time
 	)
 
 	g.BeforeEach(func() {
@@ -210,10 +228,26 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
 			helper.Setup()
 			helper.DeployExtraWorker(0)
 		}
+
+		configClient, err = configclient.NewForConfig(cfg)
+		o.Expect(err).NotTo(o.HaveOccurred())
+		operatorsNotProgressing = getOperatorsNotProgressing(configClient)
 	})
 
 	g.AfterEach(func() {
 		helper.DeleteAllExtraWorkers()
+
+		// No cluster operator should leave Progressing=False only up to cluster scaling
+		// https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164
+		operatorsNotProgressingAfter := getOperatorsNotProgressing(configClient)
+		var violations []string
+		for operator, t1 := range operatorsNotProgressing {
+			t2, ok := operatorsNotProgressingAfter[operator]
+			if !ok || t1.Unix() != t2.Unix() {
+				violations = append(violations, operator)
+			}
+		}
+		o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
 	})
 
 	// The 30m timeout is essentially required by the baremetal platform environment,

From c9c5fa5f89028e3587e8322145678692af68c6c7 Mon Sep 17 00:00:00 2001
From: Hongkai Liu <hongkailiu@users.noreply.github.com>
Date: Thu, 25 Sep 2025 11:24:35 -0400
Subject: [PATCH 2/2] Add exceptions for the violating COs

The bugs were created for the case of node rebooting. The condition
goes to Progressing=True with the same reason that we found for the
cluster scaling up/down. Thus, we re-use the bugs instead of
creating a new set of bugs that might be closed as duplicates.
---
 test/extended/machines/scale.go | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go
index 58ebde800429..7e0dfa04aee9 100644
--- a/test/extended/machines/scale.go
+++ b/test/extended/machines/scale.go
@@ -237,13 +237,30 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
 	g.AfterEach(func() {
 		helper.DeleteAllExtraWorkers()
 
+		except := func(co string) string {
+			switch co {
+			case "dns":
+				return "https://issues.redhat.com/browse/OCPBUGS-62623"
+			case "image-registry":
+				return "https://issues.redhat.com/browse/OCPBUGS-62626"
+			case "network":
+				return "https://issues.redhat.com/browse/OCPBUGS-62630"
+			case "node-tuning":
+				return "https://issues.redhat.com/browse/OCPBUGS-62632"
+			case "storage":
+				return "https://issues.redhat.com/browse/OCPBUGS-62633"
+			default:
+				return ""
+			}
+		}
+
 		// No cluster operator should leave Progressing=False only up to cluster scaling
 		// https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164
 		operatorsNotProgressingAfter := getOperatorsNotProgressing(configClient)
 		var violations []string
 		for operator, t1 := range operatorsNotProgressing {
 			t2, ok := operatorsNotProgressingAfter[operator]
-			if !ok || t1.Unix() != t2.Unix() {
+			if reason := except(operator); reason == "" && (!ok || t1.Unix() != t2.Unix()) {
 				violations = append(violations, operator)
 			}
 		}