@@ -35,64 +35,81 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
3535}
3636
3737func (w * metricsEndpointDown ) CollectData (ctx context.Context , storageDir string , beginning , end time.Time ) (monitorapi.Intervals , []* junitapi.JUnitTestCase , error ) {
38- intervals , err := buildIntervalsForMetricsEndpointsDown ( ctx , w . adminRESTConfig , beginning )
39- return intervals , nil , err
38+ // Don't return intervals here - we'll filter them in ConstructComputedIntervals
39+ return nil , nil , nil
4040}
4141
42- func (* metricsEndpointDown ) ConstructComputedIntervals (ctx context.Context , startingIntervals monitorapi.Intervals , recordedResources monitorapi.ResourcesMap , beginning , end time.Time ) (monitorapi.Intervals , error ) {
43- return nil , nil
44- }
45-
46- func (* metricsEndpointDown ) EvaluateTestsFromConstructedIntervals (ctx context.Context , finalIntervals monitorapi.Intervals ) ([]* junitapi.JUnitTestCase , error ) {
47- failures := []string {}
42+ func (w * metricsEndpointDown ) ConstructComputedIntervals (ctx context.Context , startingIntervals monitorapi.Intervals , recordedResources monitorapi.ResourcesMap , beginning , end time.Time ) (monitorapi.Intervals , error ) {
4843 logger := logrus .WithField ("MonitorTest" , "MetricsEndpointDown" )
49- metricsEndpointDownIntervals := finalIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
50- return eventInterval .Source == monitorapi .SourceMetricsEndpointDown
51- })
52- logger .Infof ("found %d metrics endpoint down intervals" , len (metricsEndpointDownIntervals ))
5344
54- // We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
55- // with either:
56- nodeUpdateIntervals := finalIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
45+ // Query Prometheus for metrics endpoint down intervals
46+ metricsEndpointDownIntervals , err := buildIntervalsForMetricsEndpointsDown (ctx , w .adminRESTConfig , beginning )
47+ if err != nil {
48+ return nil , err
49+ }
50+ logger .Infof ("found %d metrics endpoint down intervals from Prometheus" , len (metricsEndpointDownIntervals ))
51+
52+ // Filter for node update and reboot intervals
53+ nodeUpdateIntervals := startingIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
5754 return (eventInterval .Source == monitorapi .SourceNodeState && eventInterval .Message .Annotations ["phase" ] == "Update" ) ||
5855 (eventInterval .Source == monitorapi .SourceNodeState && eventInterval .Message .Annotations ["phase" ] == "Reboot" )
5956 })
60- logger .Infof ("found %d node update intervals" , len (nodeUpdateIntervals ))
57+ logger .Infof ("found %d node update/reboot intervals" , len (nodeUpdateIntervals ))
6158
59+ // Filter out metrics endpoint down intervals that overlap with node updates/reboots
60+ filteredIntervals := monitorapi.Intervals {}
6261 for _ , downInterval := range metricsEndpointDownIntervals {
63- logger .Infof ("checking metrics down interval: %s" , downInterval )
6462 restartsForNodeIntervals := nodeUpdateIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
6563 return eventInterval .Locator .Keys [monitorapi .LocatorNodeKey ] == downInterval .Locator .Keys [monitorapi .LocatorNodeKey ]
6664 })
6765 overlapIntervals := utility .FindOverlap (restartsForNodeIntervals , downInterval )
6866 if len (overlapIntervals ) == 0 {
69- failures = append ( failures , downInterval . String ())
70- logger . Info ( "found no overlap with a node update" )
67+ // No overlap with node update/reboot - keep this interval
68+ filteredIntervals = append ( filteredIntervals , downInterval )
7169 } else {
72- logger .Infof ("found overlap with a node update: %s" , overlapIntervals [ 0 ] )
72+ logger .Infof ("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s" , downInterval )
7371 }
7472 }
73+ logger .Infof ("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)" ,
74+ len (filteredIntervals ), len (metricsEndpointDownIntervals )- len (filteredIntervals ))
75+
76+ return filteredIntervals , nil
77+ }
78+
79+ func (* metricsEndpointDown ) EvaluateTestsFromConstructedIntervals (ctx context.Context , finalIntervals monitorapi.Intervals ) ([]* junitapi.JUnitTestCase , error ) {
80+ logger := logrus .WithField ("MonitorTest" , "MetricsEndpointDown" )
81+
82+ // Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals
83+ // to exclude overlaps with node updates/reboots
84+ metricsEndpointDownIntervals := finalIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
85+ return eventInterval .Source == monitorapi .SourceMetricsEndpointDown
86+ })
87+ logger .Infof ("evaluating %d metrics endpoint down intervals (already filtered)" , len (metricsEndpointDownIntervals ))
88+
7589 junits := []* junitapi.JUnitTestCase {}
76- if len (failures ) > 0 {
90+ if len (metricsEndpointDownIntervals ) > 0 {
91+ failures := []string {}
92+ for _ , downInterval := range metricsEndpointDownIntervals {
93+ failures = append (failures , downInterval .String ())
94+ }
7795 testOutput := fmt .Sprintf ("found prometheus reporting metrics endpoints down outside of a node update: \n %s" ,
7896 strings .Join (failures , "\n " ))
79- // This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit:
80- // Limit to kubelet service, all we're querying right now?
8197 junits = append (junits , & junitapi.JUnitTestCase {
8298 Name : testName ,
8399 FailureOutput : & junitapi.FailureOutput {
84100 Output : testOutput ,
85101 },
86102 })
87103 }
88- // Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild.
104+ // Add a success so this is marked as a flake at worst
89105 junits = append (junits , & junitapi.JUnitTestCase {
90106 Name : testName ,
91107 })
92108 return junits , nil
93109}
94110
95- func (* metricsEndpointDown ) WriteContentToStorage (ctx context.Context , storageDir , timeSuffix string , finalIntervals monitorapi.Intervals , finalResourceState monitorapi.ResourcesMap ) error {
111+ func (w * metricsEndpointDown ) WriteContentToStorage (ctx context.Context , storageDir , timeSuffix string , finalIntervals monitorapi.Intervals , finalResourceState monitorapi.ResourcesMap ) error {
112+ // No longer writing autodl files here - intervaldurationsum monitor test handles this
96113 return nil
97114}
98115