Skip to content

Commit f07be90

Browse files
Merge pull request #30593 from dgoodwin/kubelet-metrics-total-outage
NO-JIRA: Track the total kubelet metrics outage durations with autodl framework
2 parents 0d04a81 + 63129f5 commit f07be90

File tree

3 files changed

+155
-25
lines changed

3 files changed

+155
-25
lines changed

pkg/defaultmonitortests/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"github.com/openshift/origin/pkg/monitortests/testframework/etcddiskmetricsintervals"
5656
"github.com/openshift/origin/pkg/monitortests/testframework/highcputestanalyzer"
5757

58+
"github.com/openshift/origin/pkg/monitortests/testframework/intervaldurationsum"
5859
"github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer"
5960
"github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker"
6061
"github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests"
@@ -134,6 +135,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
134135

135136
monitorTestRegistry.AddMonitorTestOrDie("alert-summary-serializer", "Test Framework", alertanalyzer.NewAlertSummarySerializer())
136137
monitorTestRegistry.AddMonitorTestOrDie("metrics-endpoints-down", "Test Framework", metricsendpointdown.NewMetricsEndpointDown())
138+
monitorTestRegistry.AddMonitorTestOrDie("interval-duration-sum", "Test Framework", intervaldurationsum.NewIntervalDurationSum())
137139
monitorTestRegistry.AddMonitorTestOrDie("external-service-availability", "Test Framework", disruptionexternalservicemonitoring.NewAvailabilityInvariant())
138140
monitorTestRegistry.AddMonitorTestOrDie("external-gcp-cloud-service-availability", "Test Framework", disruptionexternalgcpcloudservicemonitoring.NewCloudAvailabilityInvariant())
139141
monitorTestRegistry.AddMonitorTestOrDie("external-aws-cloud-service-availability", "Test Framework", disruptionexternalawscloudservicemonitoring.NewCloudAvailabilityInvariant())
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package intervaldurationsum
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"path/filepath"
7+
"time"
8+
9+
"github.com/openshift/origin/pkg/dataloader"
10+
"github.com/openshift/origin/pkg/monitortestframework"
11+
"github.com/sirupsen/logrus"
12+
13+
"github.com/openshift/origin/pkg/monitor/monitorapi"
14+
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
15+
"k8s.io/client-go/rest"
16+
)
17+
18+
// intervalDurationSum is a monitor test that sums the total duration of intervals
// matching specific sources and writes the results to an autodl file.
//
// The generated autodl file will have the following schema:
// - IntervalSource (string): The source type of the intervals
// - TotalDurationSeconds (int): Sum of all interval durations in seconds for that source
//
// The autodl file is named: interval-duration-sum{timeSuffix}-{dataloader.AutoDataLoaderSuffix}
// (see WriteContentToStorage for the exact construction).
type intervalDurationSum struct {
	// adminRESTConfig is captured by StartCollection. Nothing else in this
	// file reads it — NOTE(review): confirm it is actually needed.
	adminRESTConfig *rest.Config
}
29+
30+
// NewIntervalDurationSum creates a monitor test that sums the total duration of intervals
// for specific sources and writes the results to an autodl file.
// The zero value of intervalDurationSum is sufficient; all state is set later
// in StartCollection.
func NewIntervalDurationSum() monitortestframework.MonitorTest {
	return &intervalDurationSum{}
}
35+
36+
// PrepareCollection is a no-op; this monitor test needs no setup before
// collection starts.
func (w *intervalDurationSum) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	return nil
}
39+
40+
// StartCollection records the admin REST config on the receiver. No background
// collection is started; the summing work happens in WriteContentToStorage,
// which does not use this config — NOTE(review): confirm storing it is needed.
func (w *intervalDurationSum) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	w.adminRESTConfig = adminRESTConfig
	return nil
}
44+
45+
// CollectData is a no-op; this monitor test produces no intervals or junit
// results of its own. It only summarizes intervals produced elsewhere, in
// WriteContentToStorage.
func (w *intervalDurationSum) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	return nil, nil, nil
}
48+
49+
// ConstructComputedIntervals is a no-op; no computed intervals are produced.
func (w *intervalDurationSum) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
	return nil, nil
}
52+
53+
// EvaluateTestsFromConstructedIntervals is a no-op; this monitor test emits no
// junit test results.
func (w *intervalDurationSum) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
	return nil, nil
}
56+
57+
func (w *intervalDurationSum) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
58+
logger := logrus.WithField("MonitorTest", "IntervalDurationSum")
59+
60+
// Define the interval sources to track
61+
sourcesToTrack := []monitorapi.IntervalSource{
62+
monitorapi.SourceMetricsEndpointDown,
63+
monitorapi.SourceCPUMonitor,
64+
}
65+
66+
// Calculate total duration for each source
67+
rows := []map[string]string{}
68+
for _, source := range sourcesToTrack {
69+
matchingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
70+
return eventInterval.Source == source
71+
})
72+
73+
var totalDurationSeconds int
74+
for _, interval := range matchingIntervals {
75+
duration := int(interval.To.Sub(interval.From).Seconds())
76+
totalDurationSeconds += duration
77+
}
78+
79+
logger.Infof("Total duration for source %s: %d seconds across %d intervals", source, totalDurationSeconds, len(matchingIntervals))
80+
81+
rows = append(rows, map[string]string{
82+
"IntervalSource": string(source),
83+
"TotalDurationSeconds": fmt.Sprintf("%d", totalDurationSeconds),
84+
})
85+
}
86+
87+
// Create autodl artifact with total durations per source
88+
dataFile := dataloader.DataFile{
89+
TableName: "interval_duration_sum",
90+
Schema: map[string]dataloader.DataType{
91+
"IntervalSource": dataloader.DataTypeString,
92+
"TotalDurationSeconds": dataloader.DataTypeInteger,
93+
},
94+
Rows: rows,
95+
}
96+
97+
// Create the file name using the autodl suffix
98+
fileName := filepath.Join(storageDir, fmt.Sprintf("interval-duration-sum%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix))
99+
100+
// Write the data file
101+
err := dataloader.WriteDataFile(fileName, dataFile)
102+
if err != nil {
103+
logger.WithError(err).Warnf("unable to write data file: %s", fileName)
104+
}
105+
106+
return nil
107+
}
108+
109+
// Cleanup is a no-op; this monitor test holds no resources to release.
func (w *intervalDurationSum) Cleanup(ctx context.Context) error {
	return nil
}

pkg/monitortests/testframework/metricsendpointdown/monitortest.go

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,64 +35,81 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
3535
}
3636

3737
// CollectData intentionally returns nothing. The Prometheus query for
// metrics-endpoint-down intervals now happens in ConstructComputedIntervals,
// where the results can be filtered against node update/reboot intervals.
func (w *metricsEndpointDown) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	// Don't return intervals here - we'll filter them in ConstructComputedIntervals
	return nil, nil, nil
}
4141

42-
func (*metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
43-
return nil, nil
44-
}
45-
46-
func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
47-
failures := []string{}
42+
func (w *metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
4843
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
49-
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
50-
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
51-
})
52-
logger.Infof("found %d metrics endpoint down intervals", len(metricsEndpointDownIntervals))
5344

54-
// We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
55-
// with either:
56-
nodeUpdateIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
45+
// Query Prometheus for metrics endpoint down intervals
46+
metricsEndpointDownIntervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
47+
if err != nil {
48+
return nil, err
49+
}
50+
logger.Infof("found %d metrics endpoint down intervals from Prometheus", len(metricsEndpointDownIntervals))
51+
52+
// Filter for node update and reboot intervals
53+
nodeUpdateIntervals := startingIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
5754
return (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Update") ||
5855
(eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Reboot")
5956
})
60-
logger.Infof("found %d node update intervals", len(nodeUpdateIntervals))
57+
logger.Infof("found %d node update/reboot intervals", len(nodeUpdateIntervals))
6158

59+
// Filter out metrics endpoint down intervals that overlap with node updates/reboots
60+
filteredIntervals := monitorapi.Intervals{}
6261
for _, downInterval := range metricsEndpointDownIntervals {
63-
logger.Infof("checking metrics down interval: %s", downInterval)
6462
restartsForNodeIntervals := nodeUpdateIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
6563
return eventInterval.Locator.Keys[monitorapi.LocatorNodeKey] == downInterval.Locator.Keys[monitorapi.LocatorNodeKey]
6664
})
6765
overlapIntervals := utility.FindOverlap(restartsForNodeIntervals, downInterval)
6866
if len(overlapIntervals) == 0 {
69-
failures = append(failures, downInterval.String())
70-
logger.Info("found no overlap with a node update")
67+
// No overlap with node update/reboot - keep this interval
68+
filteredIntervals = append(filteredIntervals, downInterval)
7169
} else {
72-
logger.Infof("found overlap with a node update: %s", overlapIntervals[0])
70+
logger.Infof("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s", downInterval)
7371
}
7472
}
73+
logger.Infof("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)",
74+
len(filteredIntervals), len(metricsEndpointDownIntervals)-len(filteredIntervals))
75+
76+
return filteredIntervals, nil
77+
}
78+
79+
func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
80+
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
81+
82+
// Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals
83+
// to exclude overlaps with node updates/reboots
84+
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
85+
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
86+
})
87+
logger.Infof("evaluating %d metrics endpoint down intervals (already filtered)", len(metricsEndpointDownIntervals))
88+
7589
junits := []*junitapi.JUnitTestCase{}
76-
if len(failures) > 0 {
90+
if len(metricsEndpointDownIntervals) > 0 {
91+
failures := []string{}
92+
for _, downInterval := range metricsEndpointDownIntervals {
93+
failures = append(failures, downInterval.String())
94+
}
7795
testOutput := fmt.Sprintf("found prometheus reporting metrics endpoints down outside of a node update: \n %s",
7896
strings.Join(failures, "\n "))
79-
// This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit:
80-
// Limit to kubelet service, all we're querying right now?
8197
junits = append(junits, &junitapi.JUnitTestCase{
8298
Name: testName,
8399
FailureOutput: &junitapi.FailureOutput{
84100
Output: testOutput,
85101
},
86102
})
87103
}
88-
// Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild.
104+
// Add a success so this is marked as a flake at worst
89105
junits = append(junits, &junitapi.JUnitTestCase{
90106
Name: testName,
91107
})
92108
return junits, nil
93109
}
94110

95-
// WriteContentToStorage is a no-op. The interval-duration-sum autodl artifact
// this monitor test used to write is now produced by the intervaldurationsum
// monitor test instead.
func (w *metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
	// No longer writing autodl files here - intervaldurationsum monitor test handles this
	return nil
}
98115

0 commit comments

Comments
 (0)