feat: metrics improvements to op-acceptor (#503)

jelias2 · web-flow · commit 2256f8a9e6eb · 2025-10-30T23:02:41.000-04:00
diff --git a/op-acceptor/metrics/metrics.go b/op-acceptor/metrics/metrics.go
@@ -107,6 +107,9 @@ var (
 		Help:      "Total number of tests run (aggregate counter without run_id)",
 	}, []string{
 		"network_name",
+		"test_name",
+		"gate",
+		"suite",
 	})
 
 	testsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -115,6 +118,9 @@ var (
 		Help:      "Total number of passed tests (aggregate counter without run_id)",
 	}, []string{
 		"network_name",
+		"test_name",
+		"gate",
+		"suite",
 	})
 
 	testsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -123,6 +129,9 @@ var (
 		Help:      "Total number of failed tests (aggregate counter without run_id)",
 	}, []string{
 		"network_name",
+		"test_name",
+		"gate",
+		"suite",
 	})
 
 	testsSkipped = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -131,6 +140,9 @@ var (
 		Help:      "Total number of skipped tests (aggregate counter without run_id)",
 	}, []string{
 		"network_name",
+		"test_name",
+		"gate",
+		"suite",
 	})
 
 	// Metrics for individual test tracking
@@ -157,6 +169,101 @@ var (
 		"gate",
 		"suite",
 	})
+
+	// Test duration histogram to track distribution of test execution times
+	testDurationHistogram = promauto.NewHistogramVec(prometheus.HistogramOpts{
+		Namespace: MetricsNamespace,
+		Name:      "test_duration_histogram_seconds",
+		Help:      "Histogram of test execution durations in seconds",
+		Buckets:   []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600}, // 100ms to 10min
+	}, []string{
+		"network_name",
+		"test_name",
+		"gate",
+		"suite",
+	})
+
+	// Test timeout tracking
+	testTimeouts = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "test_timeouts_total",
+		Help:      "Total number of tests that timed out",
+	}, []string{
+		"network_name",
+		"run_id",
+		"test_name",
+		"gate",
+		"suite",
+	})
+
+	// Gate-level aggregated metrics
+	gateTestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "gate_tests_total",
+		Help:      "Total number of tests per gate",
+	}, []string{
+		"network_name",
+		"gate",
+	})
+
+	gateTestsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "gate_tests_passed_total",
+		Help:      "Total number of passed tests per gate",
+	}, []string{
+		"network_name",
+		"gate",
+	})
+
+	gateTestsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "gate_tests_failed_total",
+		Help:      "Total number of failed tests per gate",
+	}, []string{
+		"network_name",
+		"gate",
+	})
+
+	gateDurationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: MetricsNamespace,
+		Name:      "gate_duration_seconds",
+		Help:      "Duration of gate execution in seconds",
+	}, []string{
+		"network_name",
+		"run_id",
+		"gate",
+	})
+
+	// Suite-level metrics
+	suiteTestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "suite_tests_total",
+		Help:      "Total number of tests per suite",
+	}, []string{
+		"network_name",
+		"gate",
+		"suite",
+	})
+
+	suiteTestsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "suite_tests_passed_total",
+		Help:      "Total number of passed tests per suite",
+	}, []string{
+		"network_name",
+		"gate",
+		"suite",
+	})
+
+	suiteTestsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "suite_tests_failed_total",
+		Help:      "Total number of failed tests per suite",
+	}, []string{
+		"network_name",
+		"gate",
+		"suite",
+	})
 )
 
 // errToLabel tries to make the error string a more valid Prometheus label
@@ -232,14 +339,6 @@ func RecordAcceptance(
 	}
 
 	testRunDurationSeconds.WithLabelValues(network, runID).Set(duration.Seconds())
-
-	// Also record to the continuous counters without run_id
-	testsTotal.WithLabelValues(network).Add(float64(total))
-	testsPassed.WithLabelValues(network).Add(float64(passed))
-	testsFailed.WithLabelValues(network).Add(float64(failed))
-	if skipped > 0 {
-		testsSkipped.WithLabelValues(network).Add(float64(skipped))
-	}
 }
 
 // RecordIndividualTest records metrics for an individual test
@@ -265,8 +364,44 @@ func RecordIndividualTest(
 
 	testStatus.WithLabelValues(network, runID, testName, gate, suite).Set(statusValue)
 	testDurationSeconds.WithLabelValues(network, runID, testName, gate, suite).Set(duration.Seconds())
+
+	// Also record to the continuous counters without run_id for time-based aggregation
+	testsTotal.WithLabelValues(network, testName, gate, suite).Inc()
+	switch status {
+	case types.TestStatusPass:
+		testsPassed.WithLabelValues(network, testName, gate, suite).Inc()
+	case types.TestStatusFail:
+		testsFailed.WithLabelValues(network, testName, gate, suite).Inc()
+	case types.TestStatusSkip:
+		testsSkipped.WithLabelValues(network, testName, gate, suite).Inc()
+	}
 }
 
 func isValidResult(result types.TestStatus) bool {
 	return slices.Contains(validResults, result)
 }
+
+// RecordTestDurationHistogram observes one test's duration, in seconds, on the per-test duration histogram.
+func RecordTestDurationHistogram(network, testName, gate, suite string, duration time.Duration) {
+	testDurationHistogram.With(prometheus.Labels{"network_name": network, "test_name": testName, "gate": gate, "suite": suite}).Observe(duration.Seconds())
+}
+
+// RecordTestTimeout increments the timeout counter for a single test within a given run.
+func RecordTestTimeout(network, runID, testName, gate, suite string) {
+	testTimeouts.With(prometheus.Labels{"network_name": network, "run_id": runID, "test_name": testName, "gate": gate, "suite": suite}).Inc()
+}
+
+// RecordGateMetrics adds a gate's test counts to the per-gate counters and sets its per-run duration gauge.
+func RecordGateMetrics(network, runID, gate string, total, passed, failed int, duration time.Duration) {
+	gateTestsTotal.With(prometheus.Labels{"network_name": network, "gate": gate}).Add(float64(total))
+	gateTestsPassed.With(prometheus.Labels{"network_name": network, "gate": gate}).Add(float64(passed))
+	gateTestsFailed.With(prometheus.Labels{"network_name": network, "gate": gate}).Add(float64(failed))
+	gateDurationSeconds.With(prometheus.Labels{"network_name": network, "run_id": runID, "gate": gate}).Set(duration.Seconds())
+}
+
+// RecordSuiteMetrics adds a suite's total, passed and failed test counts to the per-suite counters.
+func RecordSuiteMetrics(network, gate, suite string, total, passed, failed int) {
+	suiteTestsTotal.With(prometheus.Labels{"network_name": network, "gate": gate, "suite": suite}).Add(float64(total))
+	suiteTestsPassed.With(prometheus.Labels{"network_name": network, "gate": gate, "suite": suite}).Add(float64(passed))
+	suiteTestsFailed.With(prometheus.Labels{"network_name": network, "gate": gate, "suite": suite}).Add(float64(failed))
+}
diff --git a/op-acceptor/nat.go b/op-acceptor/nat.go
@@ -577,10 +577,25 @@ func (n *nat) runTests(ctx context.Context) error {
 		n.result.Duration,
 	)
 
-	// Record metrics for individual tests
+	// Record metrics for individual tests and aggregated gate/suite metrics
 	for _, gate := range n.result.Gates {
+		// Calculate gate-level aggregates
+		gateTotal := 0
+		gatePassed := 0
+		gateFailed := 0
+		var gateDuration time.Duration
+
 		// Record direct gate tests
 		for testName, test := range gate.Tests {
+			gateTotal++
+			gateDuration += test.Duration
+
+			if test.Status == types.TestStatusPass {
+				gatePassed++
+			} else if test.Status == types.TestStatusFail {
+				gateFailed++
+			}
+
 			metrics.RecordIndividualTest(
 				n.networkName,
 				n.result.RunID,
@@ -591,6 +606,14 @@ func (n *nat) runTests(ctx context.Context) error {
 				test.Duration,
 			)
 
+			// Record duration histogram
+			metrics.RecordTestDurationHistogram(n.networkName, testName, gate.ID, "", test.Duration)
+
+			// Check for timeout in error message
+			if test.Error != nil && strings.Contains(test.Error.Error(), "timeout") {
+				metrics.RecordTestTimeout(n.networkName, n.result.RunID, testName, gate.ID, "")
+			}
+
 			// Record subtests if present
 			for subTestName, subTest := range test.SubTests {
 				metrics.RecordIndividualTest(
@@ -602,12 +625,37 @@ func (n *nat) runTests(ctx context.Context) error {
 					subTest.Status,
 					subTest.Duration,
 				)
+
+				// Record subtest duration histogram
+				metrics.RecordTestDurationHistogram(n.networkName, subTestName, gate.ID, "", subTest.Duration)
+
+				// Check for timeout in subtest
+				if subTest.Error != nil && strings.Contains(subTest.Error.Error(), "timeout") {
+					metrics.RecordTestTimeout(n.networkName, n.result.RunID, subTestName, gate.ID, "")
+				}
 			}
 		}
 
 		// Record suite tests
 		for suiteName, suite := range gate.Suites {
+			// Calculate suite-level aggregates
+			suiteTotal := 0
+			suitePassed := 0
+			suiteFailed := 0
+
 			for testName, test := range suite.Tests {
+				gateTotal++
+				suiteTotal++
+				gateDuration += test.Duration
+
+				if test.Status == types.TestStatusPass {
+					gatePassed++
+					suitePassed++
+				} else if test.Status == types.TestStatusFail {
+					gateFailed++
+					suiteFailed++
+				}
+
 				metrics.RecordIndividualTest(
 					n.networkName,
 					n.result.RunID,
@@ -618,6 +666,14 @@ func (n *nat) runTests(ctx context.Context) error {
 					test.Duration,
 				)
 
+				// Record duration histogram
+				metrics.RecordTestDurationHistogram(n.networkName, testName, gate.ID, suiteName, test.Duration)
+
+				// Check for timeout
+				if test.Error != nil && strings.Contains(test.Error.Error(), "timeout") {
+					metrics.RecordTestTimeout(n.networkName, n.result.RunID, testName, gate.ID, suiteName)
+				}
+
 				// Record subtests if present
 				for subTestName, subTest := range test.SubTests {
 					metrics.RecordIndividualTest(
@@ -629,8 +685,26 @@ func (n *nat) runTests(ctx context.Context) error {
 						subTest.Status,
 						subTest.Duration,
 					)
+
+					// Record subtest duration histogram
+					metrics.RecordTestDurationHistogram(n.networkName, subTestName, gate.ID, suiteName, subTest.Duration)
+
+					// Check for timeout in subtest
+					if subTest.Error != nil && strings.Contains(subTest.Error.Error(), "timeout") {
+						metrics.RecordTestTimeout(n.networkName, n.result.RunID, subTestName, gate.ID, suiteName)
+					}
 				}
 			}
+
+			// Record suite-level metrics
+			if suiteTotal > 0 {
+				metrics.RecordSuiteMetrics(n.networkName, gate.ID, suiteName, suiteTotal, suitePassed, suiteFailed)
+			}
+		}
+
+		// Record gate-level metrics
+		if gateTotal > 0 {
+			metrics.RecordGateMetrics(n.networkName, n.result.RunID, gate.ID, gateTotal, gatePassed, gateFailed, gateDuration)
 		}
 	}