@@ -145,9 +145,13 @@ func max(a, b int) int {
145145 return b
146146}
147147
148+ // permittedRetryImageTags defines which image tags are allowed for test retries
149+ // Don't add more here without discussion with OCP architects, we should be moving towards not having any flakes
150+ var permittedRetryImageTags = []string {"tests" } // tests = openshift-tests image
151+
148152// shouldRetryTest determines if a failed test should be retried based on retry policies.
149153// It returns true if the test is eligible for retry, false otherwise.
150- func shouldRetryTest (ctx context.Context , test * testCase , permittedRetryImageTags [] string ) bool {
154+ func shouldRetryTest (ctx context.Context , test * testCase ) bool {
151155 // Internal tests (no binary) are eligible for retry, we shouldn't really have any of these
152156 // now that origin is also an extension.
153157 if test .binary == nil {
@@ -178,6 +182,21 @@ func shouldRetryTest(ctx context.Context, test *testCase, permittedRetryImageTag
178182 return false
179183}
180184
185+ // getRetriableTestCases takes a slice of test cases and returns copies of the retriable ones
186+ // according to image tag rules.
187+ func getRetriableTestCases (ctx context.Context , tests []* testCase ) []* testCase {
188+ var retries []* testCase
189+
190+ for _ , test := range tests {
191+ if shouldRetryTest (ctx , test ) {
192+ retry := test .Retry ()
193+ retries = append (retries , retry )
194+ }
195+ }
196+
197+ return retries
198+ }
199+
181200func (o * GinkgoRunSuiteOptions ) Run (suite * TestSuite , clusterConfig * clusterdiscovery.ClusterConfiguration , junitSuiteName string , monitorTestInfo monitortestframework.MonitorTestInitializationInfo ,
182201 upgrade bool ) error {
183202 ctx := context .Background ()
@@ -506,72 +525,31 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
506525 }
507526
508527 // calculate the effective test set we ran, excluding any incompletes
509- tests , _ = splitTests (tests , func (t * testCase ) bool { return t .success || t .flake || t . failed || t .skipped })
528+ tests , _ = splitTests (tests , func (t * testCase ) bool { return t .success || t .failed || t .skipped })
510529
511530 end := time .Now ()
512531 duration := end .Sub (start ).Round (time .Second / 10 )
513532 if duration > time .Minute {
514533 duration = duration .Round (time .Second )
515534 }
516535
517- pass , fail , skip , failing := summarizeTests (tests )
518-
519- // Determine if we should retry any tests for flake detection
520- // Don't add more here without discussion with OCP architects, we should be moving towards not having any flakes
521- permittedRetryImageTags := []string {"tests" } // tests = openshift-tests image
522- if fail > 0 && fail <= suite .MaximumAllowedFlakes {
523- var retries []* testCase
524-
525- failedUnretriableTestCount := 0
526- for _ , test := range failing {
527- if shouldRetryTest (ctx , test , permittedRetryImageTags ) {
528- retry := test .Retry ()
529- retries = append (retries , retry )
530- if len (retries ) > suite .MaximumAllowedFlakes {
531- break
532- }
533- } else if test .binary != nil {
534- // Do not retry extension tests -- we also want to remove retries from origin-sourced
535- // tests, but extensions is where we can start.
536- failedUnretriableTestCount ++
537- }
538- }
539-
540- logrus .Warningf ("%d tests failed, %d tests permitted to be retried; %d failures are terminal non-retryable failures" , len (failing ), len (retries ), failedUnretriableTestCount )
536+ retries := getRetriableTestCases (ctx , tests )
537+ if len (retries ) <= suite .MaximumAllowedFlakes {
538+ logrus .Warningf ("%d failed tests to be retried" , len (retries ))
541539
542540 // Run the tests in the retries list.
543541 q := newParallelTestQueue (testRunnerContext )
544542 q .Execute (testCtx , retries , parallelism , testOutputConfig , abortFn )
545543
546- var flaky , skipped []string
547- var repeatFailures []* testCase
548- for _ , test := range retries {
549- if test .success {
550- flaky = append (flaky , test .name )
551- } else if test .skipped {
552- skipped = append (skipped , test .name )
553- } else {
554- repeatFailures = append (repeatFailures , test )
555- }
556- }
557-
558- // Add the list of retries into the list of all tests.
544+ // If a retry test got skipped, it means we very likely failed a precondition in the first failure, so
545+ // we need to remove the failure case
546+ var skipped []string
559547 for _ , retry := range retries {
560- if retry .flake {
561- // Retry tests that flaked are omitted so that the original test is counted as a failure.
562- fmt .Fprintf (o .Out , "Ignoring retry that returned a flake, original failure is authoritative for test: %s\n " , retry .name )
563- continue
548+ if retry .skipped {
549+ skipped = append (skipped , retry .name )
564550 }
565- tests = append (tests , retry )
566- }
567- if len (flaky ) > 0 {
568- failing = repeatFailures
569- sort .Strings (flaky )
570- fmt .Fprintf (o .Out , "Flaky tests:\n \n %s\n \n " , strings .Join (flaky , "\n " ))
571551 }
572552 if len (skipped ) > 0 {
573- // If a retry test got skipped, it means we very likely failed a precondition in the first failure, so
574- // we need to remove the failure case.
575553 var withoutPreconditionFailures []* testCase
576554 testLoop:
577555 for _ , t := range tests {
@@ -583,12 +561,28 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
583561 }
584562 }
585563 tests = withoutPreconditionFailures
586- failing = repeatFailures
587564 sort .Strings (skipped )
588565 fmt .Fprintf (o .Out , "Skipped tests that failed a precondition:\n \n %s\n \n " , strings .Join (skipped , "\n " ))
566+ }
589567
568+ // Append our retry results to the final set
569+ tests = append (tests , retries ... )
570+ }
571+
572+ // Calculate test results after retries
573+ result := summarizeTests (tests )
574+ pass , fail , skip , flake := len (result .passed ), len (result .failed ), len (result .skipped ), len (result .flaked )
575+ logrus .Infof ("%d pass, %d fail, %d skip, %d flake" , pass , fail , skip , flake )
576+
577+ // Extract flaky test names for logging
578+ var flakingTestNames []string
579+ for _ , flakedGroup := range result .flaked {
580+ if len (flakedGroup ) > 0 {
581+ flakingTestNames = append (flakingTestNames , flakedGroup [0 ].name )
590582 }
591583 }
584+ logrus .Infof ("Flaky tests: %v" , strings .Join (flakingTestNames , "\n " ))
585+ logrus .Infof ("Failing tests: %v" , strings .Join (testNames (result .failed ), "\n " ))
592586
593587 // monitor the cluster while the tests are running and report any detected anomalies
594588 var syntheticTestResults []* junitapi.JUnitTestCase
@@ -661,8 +655,8 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
661655 }
662656
663657 // report the outcome of the test
664- if len (failing ) > 0 {
665- names := sets .NewString (testNames (failing )... ).List ()
658+ if len (result . failed ) > 0 {
659+ names := sets .NewString (testNames (result . failed )... ).List ()
666660 fmt .Fprintf (o .Out , "Failing tests:\n \n %s\n \n " , strings .Join (names , "\n " ))
667661 }
668662
@@ -681,11 +675,18 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
681675 }
682676 }
683677
684- if fail > 0 {
685- if len (failing ) > 0 || suite .MaximumAllowedFlakes == 0 {
686- return fmt .Errorf ("%d fail, %d pass, %d skip (%s)" , fail , pass , skip , duration )
687- }
688- fmt .Fprintf (o .Out , "%d flakes detected, suite allows passing with only flakes\n \n " , fail )
678+ // Check for hard failures first - these always cause suite failure
679+ if fail > 0 && result .allFailuresBlocking () {
680+ logrus .Errorf ("Suite failing due to %d hard failures" , fail )
681+ return fmt .Errorf ("%d fail, %d pass, %d skip, %d flake (%s)" , fail , pass , skip , flake , duration )
682+ } else if fail > 0 {
683+ logrus .Warningf ("All %d test failures are not blocking" , fail )
684+ }
685+
686+ // Check for flakes - these only cause failure if MaximumAllowedFlakes is 0
687+ if flake > 0 && suite .MaximumAllowedFlakes == 0 {
688+ logrus .Errorf ("Suite failing because MaximumAllowedFlakes is 0 but %d tests flaked" , flake )
689+ return fmt .Errorf ("%d flake, %d pass, %d skip (%s)" , flake , pass , skip , duration )
689690 }
690691
691692 if syntheticFailure {
@@ -695,7 +696,11 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
695696 return fmt .Errorf ("failed due to a MonitorTest failure" )
696697 }
697698
698- fmt .Fprintf (o .Out , "%d pass, %d skip (%s)\n " , pass , skip , duration )
699+ if flake > 0 {
700+ fmt .Fprintf (o .Out , "%d pass, %d skip, %d flake (%s)\n " , pass , skip , flake , duration )
701+ } else {
702+ fmt .Fprintf (o .Out , "%d pass, %d skip (%s)\n " , pass , skip , duration )
703+ }
699704 return ctx .Err ()
700705}
701706
0 commit comments