Skip to content

Commit f07be90

Browse files
Merge pull request #30593 from dgoodwin/kubelet-metrics-total-outage
NO-JIRA: Track the total kubelet metrics outage durations with autodl framework
2 parents 0d04a81 + 63129f5 commit f07be90

File tree

3 files changed

+155
-25
lines changed

3 files changed

+155
-25
lines changed

pkg/defaultmonitortests/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"github.com/openshift/origin/pkg/monitortests/testframework/etcddiskmetricsintervals"
5656
"github.com/openshift/origin/pkg/monitortests/testframework/highcputestanalyzer"
5757

58+
"github.com/openshift/origin/pkg/monitortests/testframework/intervaldurationsum"
5859
"github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer"
5960
"github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker"
6061
"github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests"
@@ -134,6 +135,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
134135

135136
monitorTestRegistry.AddMonitorTestOrDie("alert-summary-serializer", "Test Framework", alertanalyzer.NewAlertSummarySerializer())
136137
monitorTestRegistry.AddMonitorTestOrDie("metrics-endpoints-down", "Test Framework", metricsendpointdown.NewMetricsEndpointDown())
138+
monitorTestRegistry.AddMonitorTestOrDie("interval-duration-sum", "Test Framework", intervaldurationsum.NewIntervalDurationSum())
137139
monitorTestRegistry.AddMonitorTestOrDie("external-service-availability", "Test Framework", disruptionexternalservicemonitoring.NewAvailabilityInvariant())
138140
monitorTestRegistry.AddMonitorTestOrDie("external-gcp-cloud-service-availability", "Test Framework", disruptionexternalgcpcloudservicemonitoring.NewCloudAvailabilityInvariant())
139141
monitorTestRegistry.AddMonitorTestOrDie("external-aws-cloud-service-availability", "Test Framework", disruptionexternalawscloudservicemonitoring.NewCloudAvailabilityInvariant())
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package intervaldurationsum
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"path/filepath"
7+
"time"
8+
9+
"github.com/openshift/origin/pkg/dataloader"
10+
"github.com/openshift/origin/pkg/monitortestframework"
11+
"github.com/sirupsen/logrus"
12+
13+
"github.com/openshift/origin/pkg/monitor/monitorapi"
14+
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
15+
"k8s.io/client-go/rest"
16+
)
17+
18+
// intervalDurationSum is a monitor test that sums the total duration of intervals
// matching specific sources and writes the results to an autodl file.
//
// The generated autodl file will have the following schema:
// - IntervalSource (string): The source type of the intervals
// - TotalDurationSeconds (int): Sum of all interval durations in seconds for that source
//
// The autodl file is named: interval-duration-sum{timeSuffix}-{dataloader.AutoDataLoaderSuffix}
// (see WriteContentToStorage for the exact construction).
type intervalDurationSum struct {
	// adminRESTConfig is captured by StartCollection. Nothing else in this
	// file reads it — NOTE(review): confirm it is actually needed.
	adminRESTConfig *rest.Config
}
29+
30+
// NewIntervalDurationSum creates a monitor test that sums the total duration of intervals
// for specific sources and writes the results to an autodl file.
// The zero value of intervalDurationSum is sufficient; all state is set later
// in StartCollection.
func NewIntervalDurationSum() monitortestframework.MonitorTest {
	return &intervalDurationSum{}
}
35+
36+
// PrepareCollection is a no-op; this monitor test needs no setup before
// collection starts.
func (w *intervalDurationSum) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	return nil
}
39+
40+
// StartCollection records the admin REST config on the receiver. No background
// collection is started; the summing work happens in WriteContentToStorage,
// which does not use this config — NOTE(review): confirm storing it is needed.
func (w *intervalDurationSum) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	w.adminRESTConfig = adminRESTConfig
	return nil
}
44+
45+
// CollectData is a no-op; this monitor test produces no intervals or junit
// results of its own. It only summarizes intervals produced elsewhere, in
// WriteContentToStorage.
func (w *intervalDurationSum) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	return nil, nil, nil
}
48+
49+
// ConstructComputedIntervals is a no-op; no computed intervals are produced.
func (w *intervalDurationSum) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
	return nil, nil
}
52+
53+
// EvaluateTestsFromConstructedIntervals is a no-op; this monitor test emits no
// junit test results.
func (w *intervalDurationSum) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
	return nil, nil
}
56+
57+
func (w *intervalDurationSum) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
58+
logger := logrus.WithField("MonitorTest", "IntervalDurationSum")
59+
60+
// Define the interval sources to track
61+
sourcesToTrack := []monitorapi.IntervalSource{
62+
monitorapi.SourceMetricsEndpointDown,
63+
monitorapi.SourceCPUMonitor,
64+
}
65+
66+
// Calculate total duration for each source
67+
rows := []map[string]string{}
68+
for _, source := range sourcesToTrack {
69+
matchingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
70+
return eventInterval.Source == source
71+
})
72+
73+
var totalDurationSeconds int
74+
for _, interval := range matchingIntervals {
75+
duration := int(interval.To.Sub(interval.From).Seconds())
76+
totalDurationSeconds += duration
77+
}
78+
79+
logger.Infof("Total duration for source %s: %d seconds across %d intervals", source, totalDurationSeconds, len(matchingIntervals))
80+
81+
rows = append(rows, map[string]string{
82+
"IntervalSource": string(source),
83+
"TotalDurationSeconds": fmt.Sprintf("%d", totalDurationSeconds),
84+
})
85+
}
86+
87+
// Create autodl artifact with total durations per source
88+
dataFile := dataloader.DataFile{
89+
TableName: "interval_duration_sum",
90+
Schema: map[string]dataloader.DataType{
91+
"IntervalSource": dataloader.DataTypeString,
92+
"TotalDurationSeconds": dataloader.DataTypeInteger,
93+
},
94+
Rows: rows,
95+
}
96+
97+
// Create the file name using the autodl suffix
98+
fileName := filepath.Join(storageDir, fmt.Sprintf("interval-duration-sum%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix))
99+
100+
// Write the data file
101+
err := dataloader.WriteDataFile(fileName, dataFile)
102+
if err != nil {
103+
logger.WithError(err).Warnf("unable to write data file: %s", fileName)
104+
}
105+
106+
return nil
107+
}
108+
109+
// Cleanup is a no-op; this monitor test holds no resources to release.
func (w *intervalDurationSum) Cleanup(ctx context.Context) error {
	return nil
}

pkg/monitortests/testframework/metricsendpointdown/monitortest.go

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,64 +35,81 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
3535
}
3636

3737
// CollectData intentionally returns nothing. The Prometheus query for
// metrics-endpoint-down intervals now happens in ConstructComputedIntervals,
// where the results can be filtered against node update/reboot intervals.
func (w *metricsEndpointDown) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	// Don't return intervals here - we'll filter them in ConstructComputedIntervals
	return nil, nil, nil
}
4141

42-
func (*metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
43-
return nil, nil
44-
}
45-
46-
func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
47-
failures := []string{}
42+
func (w *metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
4843
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
49-
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
50-
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
51-
})
52-
logger.Infof("found %d metrics endpoint down intervals", len(metricsEndpointDownIntervals))
5344

54-
// We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
55-
// with either:
56-
nodeUpdateIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
45+
// Query Prometheus for metrics endpoint down intervals
46+
metricsEndpointDownIntervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
47+
if err != nil {
48+
return nil, err
49+
}
50+
logger.Infof("found %d metrics endpoint down intervals from Prometheus", len(metricsEndpointDownIntervals))
51+
52+
// Filter for node update and reboot intervals
53+
nodeUpdateIntervals := startingIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
5754
return (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Update") ||
5855
(eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Reboot")
5956
})
60-
logger.Infof("found %d node update intervals", len(nodeUpdateIntervals))
57+
logger.Infof("found %d node update/reboot intervals", len(nodeUpdateIntervals))
6158

59+
// Filter out metrics endpoint down intervals that overlap with node updates/reboots
60+
filteredIntervals := monitorapi.Intervals{}
6261
for _, downInterval := range metricsEndpointDownIntervals {
63-
logger.Infof("checking metrics down interval: %s", downInterval)
6462
restartsForNodeIntervals := nodeUpdateIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
6563
return eventInterval.Locator.Keys[monitorapi.LocatorNodeKey] == downInterval.Locator.Keys[monitorapi.LocatorNodeKey]
6664
})
6765
overlapIntervals := utility.FindOverlap(restartsForNodeIntervals, downInterval)
6866
if len(overlapIntervals) == 0 {
69-
failures = append(failures, downInterval.String())
70-
logger.Info("found no overlap with a node update")
67+
// No overlap with node update/reboot - keep this interval
68+
filteredIntervals = append(filteredIntervals, downInterval)
7169
} else {
72-
logger.Infof("found overlap with a node update: %s", overlapIntervals[0])
70+
logger.Infof("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s", downInterval)
7371
}
7472
}
73+
logger.Infof("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)",
74+
len(filteredIntervals), len(metricsEndpointDownIntervals)-len(filteredIntervals))
75+
76+
return filteredIntervals, nil
77+
}
78+
79+
func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
80+
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
81+
82+
// Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals
83+
// to exclude overlaps with node updates/reboots
84+
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
85+
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
86+
})
87+
logger.Infof("evaluating %d metrics endpoint down intervals (already filtered)", len(metricsEndpointDownIntervals))
88+
7589
junits := []*junitapi.JUnitTestCase{}
76-
if len(failures) > 0 {
90+
if len(metricsEndpointDownIntervals) > 0 {
91+
failures := []string{}
92+
for _, downInterval := range metricsEndpointDownIntervals {
93+
failures = append(failures, downInterval.String())
94+
}
7795
testOutput := fmt.Sprintf("found prometheus reporting metrics endpoints down outside of a node update: \n %s",
7896
strings.Join(failures, "\n "))
79-
// This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit:
80-
// Limit to kubelet service, all we're querying right now?
8197
junits = append(junits, &junitapi.JUnitTestCase{
8298
Name: testName,
8399
FailureOutput: &junitapi.FailureOutput{
84100
Output: testOutput,
85101
},
86102
})
87103
}
88-
// Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild.
104+
// Add a success so this is marked as a flake at worst
89105
junits = append(junits, &junitapi.JUnitTestCase{
90106
Name: testName,
91107
})
92108
return junits, nil
93109
}
94110

95-
// WriteContentToStorage is a no-op. The interval-duration-sum autodl artifact
// this monitor test used to write is now produced by the intervaldurationsum
// monitor test instead.
func (w *metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
	// No longer writing autodl files here - intervaldurationsum monitor test handles this
	return nil
}
98115

0 commit comments

Comments
 (0)