Skip to content

Commit 2256f8a

Browse files
authored
feat: metrics improvements to op-acceptor (#503)
1 parent 1721c99 commit 2256f8a

File tree

2 files changed

+218
-9
lines changed

2 files changed

+218
-9
lines changed

op-acceptor/metrics/metrics.go

Lines changed: 143 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ var (
107107
Help: "Total number of tests run (aggregate counter without run_id)",
108108
}, []string{
109109
"network_name",
110+
"test_name",
111+
"gate",
112+
"suite",
110113
})
111114

112115
testsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -115,6 +118,9 @@ var (
115118
Help: "Total number of passed tests (aggregate counter without run_id)",
116119
}, []string{
117120
"network_name",
121+
"test_name",
122+
"gate",
123+
"suite",
118124
})
119125

120126
testsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -123,6 +129,9 @@ var (
123129
Help: "Total number of failed tests (aggregate counter without run_id)",
124130
}, []string{
125131
"network_name",
132+
"test_name",
133+
"gate",
134+
"suite",
126135
})
127136

128137
testsSkipped = promauto.NewCounterVec(prometheus.CounterOpts{
@@ -131,6 +140,9 @@ var (
131140
Help: "Total number of skipped tests (aggregate counter without run_id)",
132141
}, []string{
133142
"network_name",
143+
"test_name",
144+
"gate",
145+
"suite",
134146
})
135147

136148
// Metrics for individual test tracking
@@ -157,6 +169,101 @@ var (
157169
"gate",
158170
"suite",
159171
})
172+
173+
// Test duration histogram to track distribution of test execution times
174+
testDurationHistogram = promauto.NewHistogramVec(prometheus.HistogramOpts{
175+
Namespace: MetricsNamespace,
176+
Name: "test_duration_histogram_seconds",
177+
Help: "Histogram of test execution durations in seconds",
178+
Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600}, // 100ms to 10min
179+
}, []string{
180+
"network_name",
181+
"test_name",
182+
"gate",
183+
"suite",
184+
})
185+
186+
// Test timeout tracking
187+
testTimeouts = promauto.NewCounterVec(prometheus.CounterOpts{
188+
Namespace: MetricsNamespace,
189+
Name: "test_timeouts_total",
190+
Help: "Total number of tests that timed out",
191+
}, []string{
192+
"network_name",
193+
"run_id",
194+
"test_name",
195+
"gate",
196+
"suite",
197+
})
198+
199+
// Gate-level aggregated metrics
200+
gateTestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
201+
Namespace: MetricsNamespace,
202+
Name: "gate_tests_total",
203+
Help: "Total number of tests per gate",
204+
}, []string{
205+
"network_name",
206+
"gate",
207+
})
208+
209+
gateTestsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
210+
Namespace: MetricsNamespace,
211+
Name: "gate_tests_passed_total",
212+
Help: "Total number of passed tests per gate",
213+
}, []string{
214+
"network_name",
215+
"gate",
216+
})
217+
218+
gateTestsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
219+
Namespace: MetricsNamespace,
220+
Name: "gate_tests_failed_total",
221+
Help: "Total number of failed tests per gate",
222+
}, []string{
223+
"network_name",
224+
"gate",
225+
})
226+
227+
gateDurationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
228+
Namespace: MetricsNamespace,
229+
Name: "gate_duration_seconds",
230+
Help: "Duration of gate execution in seconds",
231+
}, []string{
232+
"network_name",
233+
"run_id",
234+
"gate",
235+
})
236+
237+
// Suite-level metrics
238+
suiteTestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
239+
Namespace: MetricsNamespace,
240+
Name: "suite_tests_total",
241+
Help: "Total number of tests per suite",
242+
}, []string{
243+
"network_name",
244+
"gate",
245+
"suite",
246+
})
247+
248+
suiteTestsPassed = promauto.NewCounterVec(prometheus.CounterOpts{
249+
Namespace: MetricsNamespace,
250+
Name: "suite_tests_passed_total",
251+
Help: "Total number of passed tests per suite",
252+
}, []string{
253+
"network_name",
254+
"gate",
255+
"suite",
256+
})
257+
258+
suiteTestsFailed = promauto.NewCounterVec(prometheus.CounterOpts{
259+
Namespace: MetricsNamespace,
260+
Name: "suite_tests_failed_total",
261+
Help: "Total number of failed tests per suite",
262+
}, []string{
263+
"network_name",
264+
"gate",
265+
"suite",
266+
})
160267
)
161268

162269
// errToLabel tries to make the error string a more valid Prometheus label
@@ -232,14 +339,6 @@ func RecordAcceptance(
232339
}
233340

234341
testRunDurationSeconds.WithLabelValues(network, runID).Set(duration.Seconds())
235-
236-
// Also record to the continuous counters without run_id
237-
testsTotal.WithLabelValues(network).Add(float64(total))
238-
testsPassed.WithLabelValues(network).Add(float64(passed))
239-
testsFailed.WithLabelValues(network).Add(float64(failed))
240-
if skipped > 0 {
241-
testsSkipped.WithLabelValues(network).Add(float64(skipped))
242-
}
243342
}
244343

245344
// RecordIndividualTest records metrics for an individual test
@@ -265,8 +364,44 @@ func RecordIndividualTest(
265364

266365
testStatus.WithLabelValues(network, runID, testName, gate, suite).Set(statusValue)
267366
testDurationSeconds.WithLabelValues(network, runID, testName, gate, suite).Set(duration.Seconds())
367+
368+
// Also record to the continuous counters without run_id for time-based aggregation
369+
testsTotal.WithLabelValues(network, testName, gate, suite).Inc()
370+
switch status {
371+
case types.TestStatusPass:
372+
testsPassed.WithLabelValues(network, testName, gate, suite).Inc()
373+
case types.TestStatusFail:
374+
testsFailed.WithLabelValues(network, testName, gate, suite).Inc()
375+
case types.TestStatusSkip:
376+
testsSkipped.WithLabelValues(network, testName, gate, suite).Inc()
377+
}
268378
}
269379

270380
// isValidResult reports whether result is one of the statuses listed in
// the package-level validResults slice.
func isValidResult(result types.TestStatus) bool {
271381
return slices.Contains(validResults, result)
272382
}
383+
384+
// RecordTestDurationHistogram observes a single test's execution time, in
// seconds, on testDurationHistogram for distribution analysis.
//
// The observation is keyed by (network, testName, gate, suite); no run_id is
// part of the label set, so samples from successive runs accumulate in the
// same series.
385+
func RecordTestDurationHistogram(network string, testName string, gate string, suite string, duration time.Duration) {
386+
testDurationHistogram.WithLabelValues(network, testName, gate, suite).Observe(duration.Seconds())
387+
}
388+
389+
// RecordTestTimeout increments the testTimeouts counter by one for the series
// identified by (network, runID, testName, gate, suite).
//
// The function itself does not decide whether a test timed out; the caller is
// responsible for only invoking it for tests it considers timed out.
390+
func RecordTestTimeout(network string, runID string, testName string, gate string, suite string) {
391+
testTimeouts.WithLabelValues(network, runID, testName, gate, suite).Inc()
392+
}
393+
394+
// RecordGateMetrics records aggregated metrics for a gate.
//
// total, passed and failed are added to the per-gate counters keyed by
// (network, gate), so they accumulate across calls. duration is written, in
// seconds, to the gateDurationSeconds gauge keyed by (network, runID, gate),
// overwriting any previous value for that series.
//
// NOTE: Prometheus counters panic when Add is given a negative value, so
// total, passed and failed must be non-negative.
395+
func RecordGateMetrics(network string, runID string, gate string, total int, passed int, failed int, duration time.Duration) {
396+
gateTestsTotal.WithLabelValues(network, gate).Add(float64(total))
397+
gateTestsPassed.WithLabelValues(network, gate).Add(float64(passed))
398+
gateTestsFailed.WithLabelValues(network, gate).Add(float64(failed))
399+
gateDurationSeconds.WithLabelValues(network, runID, gate).Set(duration.Seconds())
400+
}
401+
402+
// RecordSuiteMetrics records aggregated metrics for a suite.
//
// total, passed and failed are added to the per-suite counters keyed by
// (network, gate, suite), so they accumulate across calls.
//
// NOTE: Prometheus counters panic when Add is given a negative value, so
// total, passed and failed must be non-negative.
403+
func RecordSuiteMetrics(network string, gate string, suite string, total int, passed int, failed int) {
404+
suiteTestsTotal.WithLabelValues(network, gate, suite).Add(float64(total))
405+
suiteTestsPassed.WithLabelValues(network, gate, suite).Add(float64(passed))
406+
suiteTestsFailed.WithLabelValues(network, gate, suite).Add(float64(failed))
407+
}

op-acceptor/nat.go

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,10 +577,25 @@ func (n *nat) runTests(ctx context.Context) error {
577577
n.result.Duration,
578578
)
579579

580-
// Record metrics for individual tests
580+
// Record metrics for individual tests and aggregated gate/suite metrics
581581
for _, gate := range n.result.Gates {
582+
// Calculate gate-level aggregates
583+
gateTotal := 0
584+
gatePassed := 0
585+
gateFailed := 0
586+
var gateDuration time.Duration
587+
582588
// Record direct gate tests
583589
for testName, test := range gate.Tests {
590+
gateTotal++
591+
gateDuration += test.Duration
592+
593+
if test.Status == types.TestStatusPass {
594+
gatePassed++
595+
} else if test.Status == types.TestStatusFail {
596+
gateFailed++
597+
}
598+
584599
metrics.RecordIndividualTest(
585600
n.networkName,
586601
n.result.RunID,
@@ -591,6 +606,14 @@ func (n *nat) runTests(ctx context.Context) error {
591606
test.Duration,
592607
)
593608

609+
// Record duration histogram
610+
metrics.RecordTestDurationHistogram(n.networkName, testName, gate.ID, "", test.Duration)
611+
612+
// Check for timeout in error message
613+
if test.Error != nil && strings.Contains(test.Error.Error(), "timeout") {
614+
metrics.RecordTestTimeout(n.networkName, n.result.RunID, testName, gate.ID, "")
615+
}
616+
594617
// Record subtests if present
595618
for subTestName, subTest := range test.SubTests {
596619
metrics.RecordIndividualTest(
@@ -602,12 +625,37 @@ func (n *nat) runTests(ctx context.Context) error {
602625
subTest.Status,
603626
subTest.Duration,
604627
)
628+
629+
// Record subtest duration histogram
630+
metrics.RecordTestDurationHistogram(n.networkName, subTestName, gate.ID, "", subTest.Duration)
631+
632+
// Check for timeout in subtest
633+
if subTest.Error != nil && strings.Contains(subTest.Error.Error(), "timeout") {
634+
metrics.RecordTestTimeout(n.networkName, n.result.RunID, subTestName, gate.ID, "")
635+
}
605636
}
606637
}
607638

608639
// Record suite tests
609640
for suiteName, suite := range gate.Suites {
641+
// Calculate suite-level aggregates
642+
suiteTotal := 0
643+
suitePassed := 0
644+
suiteFailed := 0
645+
610646
for testName, test := range suite.Tests {
647+
gateTotal++
648+
suiteTotal++
649+
gateDuration += test.Duration
650+
651+
if test.Status == types.TestStatusPass {
652+
gatePassed++
653+
suitePassed++
654+
} else if test.Status == types.TestStatusFail {
655+
gateFailed++
656+
suiteFailed++
657+
}
658+
611659
metrics.RecordIndividualTest(
612660
n.networkName,
613661
n.result.RunID,
@@ -618,6 +666,14 @@ func (n *nat) runTests(ctx context.Context) error {
618666
test.Duration,
619667
)
620668

669+
// Record duration histogram
670+
metrics.RecordTestDurationHistogram(n.networkName, testName, gate.ID, suiteName, test.Duration)
671+
672+
// Check for timeout
673+
if test.Error != nil && strings.Contains(test.Error.Error(), "timeout") {
674+
metrics.RecordTestTimeout(n.networkName, n.result.RunID, testName, gate.ID, suiteName)
675+
}
676+
621677
// Record subtests if present
622678
for subTestName, subTest := range test.SubTests {
623679
metrics.RecordIndividualTest(
@@ -629,8 +685,26 @@ func (n *nat) runTests(ctx context.Context) error {
629685
subTest.Status,
630686
subTest.Duration,
631687
)
688+
689+
// Record subtest duration histogram
690+
metrics.RecordTestDurationHistogram(n.networkName, subTestName, gate.ID, suiteName, subTest.Duration)
691+
692+
// Check for timeout in subtest
693+
if subTest.Error != nil && strings.Contains(subTest.Error.Error(), "timeout") {
694+
metrics.RecordTestTimeout(n.networkName, n.result.RunID, subTestName, gate.ID, suiteName)
695+
}
632696
}
633697
}
698+
699+
// Record suite-level metrics
700+
if suiteTotal > 0 {
701+
metrics.RecordSuiteMetrics(n.networkName, gate.ID, suiteName, suiteTotal, suitePassed, suiteFailed)
702+
}
703+
}
704+
705+
// Record gate-level metrics
706+
if gateTotal > 0 {
707+
metrics.RecordGateMetrics(n.networkName, n.result.RunID, gate.ID, gateTotal, gatePassed, gateFailed, gateDuration)
634708
}
635709
}
636710

0 commit comments

Comments
 (0)