From 8756d7003e7ba2b80e22d990959f3bce7d62d4e0 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 23 Sep 2025 15:32:11 -0400 Subject: [PATCH 1/2] ClusterOperators should not go Progressing only for cluster scaling This is to cover the cluster scaling case from the rule [1] that is introduced recently: ``` Operators should not report Progressing only because DaemonSets owned by them are adjusting to a new node from cluster scaleup or a node rebooting from cluster upgrade. ``` The test plugs into the existing scaling test. It checks each CO's Progressing condition before and after the test, and identifies every CO that either left Progressing=False or re-entered Progressing=False with a different LastTransitionTime. [1]. https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164 --- test/extended/machines/scale.go | 40 ++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go index 695a6ce35816..58ebde800429 100644 --- a/test/extended/machines/scale.go +++ b/test/extended/machines/scale.go @@ -9,6 +9,8 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" + configv1 "github.com/openshift/api/config/v1" + configclient "github.com/openshift/client-go/config/clientset/versioned" bmhelper "github.com/openshift/origin/test/extended/baremetal" "github.com/stretchr/objx" corev1 "k8s.io/api/core/v1" @@ -186,12 +188,28 @@ func scaleMachineSet(name string, replicas int) error { return nil } +func getOperatorsNotProgressing(c configclient.Interface) map[string]metav1.Time { + operators, err := c.ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + result := map[string]metav1.Time{} + for _, operator := range operators.Items { + for _, condition := range operator.Status.Conditions { + if condition.Type == 
configv1.OperatorProgressing && condition.Status == configv1.ConditionFalse { + result[operator.Name] = condition.LastTransitionTime + } + } + } + return result +} + var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cluster should", func() { var ( - c *kubernetes.Clientset - dc dynamic.Interface - helper *bmhelper.BaremetalTestHelper + c *kubernetes.Clientset + configClient configclient.Interface + dc dynamic.Interface + helper *bmhelper.BaremetalTestHelper + operatorsNotProgressing map[string]metav1.Time ) g.BeforeEach(func() { @@ -210,10 +228,26 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl helper.Setup() helper.DeployExtraWorker(0) } + + configClient, err = configclient.NewForConfig(cfg) + o.Expect(err).NotTo(o.HaveOccurred()) + operatorsNotProgressing = getOperatorsNotProgressing(configClient) }) g.AfterEach(func() { helper.DeleteAllExtraWorkers() + + // No cluster operator should leave Progressing=False only up to cluster scaling + // https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164 + operatorsNotProgressingAfter := getOperatorsNotProgressing(configClient) + var violations []string + for operator, t1 := range operatorsNotProgressing { + t2, ok := operatorsNotProgressingAfter[operator] + if !ok || t1.Unix() != t2.Unix() { + violations = append(violations, operator) + } + } + o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations) }) // The 30m timeout is essentially required by the baremetal platform environment, From c9c5fa5f89028e3587e8322145678692af68c6c7 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Thu, 25 Sep 2025 11:24:35 -0400 Subject: [PATCH 2/2] Add exceptions for the violating COs The bugs are created for the case of node rebooting. 
The condition goes to Progressing=True with the same reason that we found for the cluster scaling up/down. Thus, we re-use the bugs instead of recreating a new set of bugs that might be closed as duplicates. --- test/extended/machines/scale.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go index 58ebde800429..7e0dfa04aee9 100644 --- a/test/extended/machines/scale.go +++ b/test/extended/machines/scale.go @@ -237,13 +237,30 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl g.AfterEach(func() { helper.DeleteAllExtraWorkers() + except := func(co string) string { + switch co { + case "dns": + return "https://issues.redhat.com/browse/OCPBUGS-62623" + case "image-registry": + return "https://issues.redhat.com/browse/OCPBUGS-62626" + case "network": + return "https://issues.redhat.com/browse/OCPBUGS-62630" + case "node-tuning": + return "https://issues.redhat.com/browse/OCPBUGS-62632" + case "storage": + return "https://issues.redhat.com/browse/OCPBUGS-62633" + default: + return "" + } + } + // No cluster operator should leave Progressing=False only up to cluster scaling // https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164 operatorsNotProgressingAfter := getOperatorsNotProgressing(configClient) var violations []string for operator, t1 := range operatorsNotProgressing { t2, ok := operatorsNotProgressingAfter[operator] - if !ok || t1.Unix() != t2.Unix() { + if reason := except(operator); reason == "" && (!ok || t1.Unix() != t2.Unix()) { violations = append(violations, operator) } }