@@ -80,45 +80,6 @@ func testEtcdShouldNotLogDroppedRaftMessages(events monitorapi.Intervals) []*jun
8080 return []* junitapi.JUnitTestCase {failure , success }
8181}
8282
83- // etcdTookTooLongMaxRatePerFourHours is the max rate of messages allowed over a 4-hour period.
84- // This replaces the fixed limit approach with a rate-based approach.
85- // Virtually all jobs log these messages at some point, we're just interested in the ones that do so excessively.
86- const etcdTookTooLongMaxRatePerFourHours = 12000
87-
88- func testEtcdDoesNotLogExcessiveTookTooLongMessages (events monitorapi.Intervals , startTime time.Time ) []* junitapi.JUnitTestCase {
89- const testName = "[sig-etcd] etcd should not log excessive took too long messages"
90- success := & junitapi.JUnitTestCase {Name : testName }
91-
92- counter := 0
93- for _ , event := range events {
94- if event .Source == monitorapi .SourceEtcdLog &&
95- strings .Contains (event .Message .HumanMessage , "took too long" ) {
96- counter ++
97- }
98- }
99-
100- maxAllowedCount := calculateRateBasedLimit (startTime , etcdTookTooLongMaxRatePerFourHours )
101- actualDuration := time .Since (startTime )
102-
103- if counter <= maxAllowedCount {
104- return []* junitapi.JUnitTestCase {success }
105- }
106-
107- msg := fmt .Sprintf ("Etcd logged %d 'took too long' messages in %v, exceeding the rate-based limit of %d " +
108- "(based on max rate of %d messages per 4 hours). This is a strong indicator that etcd was very unhealthy " +
109- "throughout the run. This can cause sporadic e2e failures and disruption and typically indicates faster " +
110- "disks are needed. These log message intervals are included in spyglass chart artifacts and can be used " +
111- "to correlate with disruption and failed tests." ,
112- counter , actualDuration .Round (time .Minute ), maxAllowedCount , etcdTookTooLongMaxRatePerFourHours )
113- failure := & junitapi.JUnitTestCase {
114- Name : testName ,
115- FailureOutput : & junitapi.FailureOutput {
116- Output : msg ,
117- },
118- }
119- return []* junitapi.JUnitTestCase {failure }
120- }
121-
12283// etcdOverloadedNetworkMaxRatePerFourHours uses the same rate-based approach for overloaded network messages.
12384// We use the same rate limit as the "took too long" messages since both indicate severe etcd health issues.
12485//
0 commit comments