Skip to content

Commit b53c4b6

Browse files
authored
Merge pull request #30143 from dgoodwin/drop-etcd-took-too-long-test
OCPBUGS-52968: Drop the etcd should not log excessive took too long messages test
2 parents a73e3db + 0d7b97d commit b53c4b6

File tree

2 files changed

+0
-40
lines changed

2 files changed

+0
-40
lines changed

pkg/monitortests/etcd/legacyetcdmonitortests/etcd.go

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -80,45 +80,6 @@ func testEtcdShouldNotLogDroppedRaftMessages(events monitorapi.Intervals) []*jun
8080
return []*junitapi.JUnitTestCase{failure, success}
8181
}
8282

83-
// etcdTookTooLongMaxRatePerFourHours is the max rate of messages allowed over a 4-hour period.
84-
// This replaces the fixed limit approach with a rate-based approach.
85-
// Virtually all jobs log these messages at some point; we're only interested in the ones that do so excessively.
86-
const etcdTookTooLongMaxRatePerFourHours = 12000
87-
88-
func testEtcdDoesNotLogExcessiveTookTooLongMessages(events monitorapi.Intervals, startTime time.Time) []*junitapi.JUnitTestCase {
89-
const testName = "[sig-etcd] etcd should not log excessive took too long messages"
90-
success := &junitapi.JUnitTestCase{Name: testName}
91-
92-
counter := 0
93-
for _, event := range events {
94-
if event.Source == monitorapi.SourceEtcdLog &&
95-
strings.Contains(event.Message.HumanMessage, "took too long") {
96-
counter++
97-
}
98-
}
99-
100-
maxAllowedCount := calculateRateBasedLimit(startTime, etcdTookTooLongMaxRatePerFourHours)
101-
actualDuration := time.Since(startTime)
102-
103-
if counter <= maxAllowedCount {
104-
return []*junitapi.JUnitTestCase{success}
105-
}
106-
107-
msg := fmt.Sprintf("Etcd logged %d 'took too long' messages in %v, exceeding the rate-based limit of %d "+
108-
"(based on max rate of %d messages per 4 hours). This is a strong indicator that etcd was very unhealthy "+
109-
"throughout the run. This can cause sporadic e2e failures and disruption and typically indicates faster "+
110-
"disks are needed. These log message intervals are included in spyglass chart artifacts and can be used "+
111-
"to correlate with disruption and failed tests.",
112-
counter, actualDuration.Round(time.Minute), maxAllowedCount, etcdTookTooLongMaxRatePerFourHours)
113-
failure := &junitapi.JUnitTestCase{
114-
Name: testName,
115-
FailureOutput: &junitapi.FailureOutput{
116-
Output: msg,
117-
},
118-
}
119-
return []*junitapi.JUnitTestCase{failure}
120-
}
121-
12283
// etcdOverloadedNetworkMaxRatePerFourHours uses the same rate-based approach for overloaded network messages.
12384
// We use the same rate limit as the "took too long" messages since both indicate severe etcd health issues.
12485
//

pkg/monitortests/etcd/legacyetcdmonitortests/monitortest.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
7171
junits = append(junits, testEtcdShouldNotLogSlowFdataSyncs(finalIntervals)...)
7272
junits = append(junits, testEtcdShouldNotLogDroppedRaftMessages(finalIntervals)...)
7373
junits = append(junits, testOperatorStatusChanged(finalIntervals)...)
74-
junits = append(junits, testEtcdDoesNotLogExcessiveTookTooLongMessages(finalIntervals, w.startTime)...)
7574
junits = append(junits, testEtcdDoesNotLogExcessiveOverloadedNetworkMessages(finalIntervals, w.startTime)...)
7675

7776
return junits, nil

0 commit comments

Comments
 (0)