Skip to content

Commit 37b6efa

Browse files
committed
Metrics Summary
Adds a util to the e2e suite which queries prometheus at the end of the test run for alerts and metrics data. This data is then processed into markdown which is displayed to the contributor at the end of their test runs. Signed-off-by: Daniel Franz <[email protected]>
1 parent e0b5c18 commit 37b6efa

File tree

11 files changed

+268
-19
lines changed

11 files changed

+268
-19
lines changed

Makefile

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -271,28 +271,21 @@ image-registry: ## Build the testdata catalog used for e2e tests and push it to
271271
test-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)
272272
test-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
273273
test-e2e: GO_BUILD_EXTRA_FLAGS := -cover
274-
test-e2e: COVERAGE_NAME := e2e
275-
test-e2e: run image-registry prometheus e2e e2e-metrics e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster
274+
test-e2e: run image-registry prometheus e2e e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster
276275

277276
.PHONY: test-experimental-e2e
278277
test-experimental-e2e: SOURCE_MANIFEST := $(EXPERIMENTAL_E2E_MANIFEST)
279278
test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
280279
test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover
281280
test-experimental-e2e: COVERAGE_NAME := experimental-e2e
282-
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
281+
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
283282

284283
.PHONY: prometheus
285284
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
286285
prometheus: PROMETHEUS_VERSION := v0.83.0
287286
prometheus: #EXHELP Deploy Prometheus into specified namespace
288287
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION)
289288

290-
# The output alerts.out file contains any alerts, pending or firing, collected during a test run in json format.
291-
.PHONY: e2e-metrics
292-
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
293-
e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
294-
curl -X GET http://localhost:30900/api/v1/alerts | jq 'if (.data.alerts | length) > 0 then .data.alerts.[] else empty end' > $(ALERTS_FILE_PATH)
295-
296289
.PHONY: extension-developer-e2e
297290
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
298291
extension-developer-e2e: export INSTALL_DEFAULT_CATALOGS := false

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ spec:
2222
annotations:
2323
description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}"
2424
- alert: operator-controller-memory-growth
25-
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
25+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
2626
for: 5m
2727
keep_firing_for: 1d
2828
annotations:
2929
description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
3030
- alert: catalogd-memory-growth
31-
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
31+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
3232
for: 5m
3333
keep_firing_for: 1d
3434
annotations:

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ require (
2323
github.com/operator-framework/helm-operator-plugins v0.8.0
2424
github.com/operator-framework/operator-registry v1.56.0
2525
github.com/prometheus/client_golang v1.23.0
26+
github.com/prometheus/common v0.65.0
2627
github.com/spf13/cobra v1.9.1
2728
github.com/stretchr/testify v1.10.0
2829
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
@@ -177,7 +178,6 @@ require (
177178
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
178179
github.com/proglottis/gpgme v0.1.4 // indirect
179180
github.com/prometheus/client_model v0.6.2 // indirect
180-
github.com/prometheus/common v0.65.0 // indirect
181181
github.com/prometheus/procfs v0.16.1 // indirect
182182
github.com/rivo/uniseg v0.4.7 // indirect
183183
github.com/rubenv/sql-migrate v1.8.0 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,8 @@ github.com/joelanford/ignore v0.1.1 h1:vKky5RDoPT+WbONrbQBgOn95VV/UPh4ejlyAbbzgn
279279
github.com/joelanford/ignore v0.1.1/go.mod h1:8eho/D8fwQ3rIXrLwE23AaeaGDNXqLE9QJ3zJ4LIPCw=
280280
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
281281
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
282+
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
283+
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
282284
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
283285
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
284286
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
@@ -352,6 +354,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/
352354
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4=
353355
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
354356
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
357+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
358+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
355359
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
356360
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
357361
github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY=

test/e2e/e2e_suite_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
ocv1 "github.com/operator-framework/operator-controller/api/v1"
1717
"github.com/operator-framework/operator-controller/internal/operator-controller/scheme"
18+
utils "github.com/operator-framework/operator-controller/test/utils"
1819
)
1920

2021
var (
@@ -23,9 +24,10 @@ var (
2324
)
2425

2526
const (
26-
testCatalogRefEnvVar = "CATALOG_IMG"
27-
testCatalogName = "test-catalog"
28-
latestImageTag = "latest"
27+
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testCatalogRefEnvVar = "CATALOG_IMG"
29+
testCatalogName = "test-catalog"
30+
latestImageTag = "latest"
2931
)
3032

3133
func TestMain(m *testing.M) {
@@ -36,7 +38,10 @@ func TestMain(m *testing.M) {
3638
c, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
3739
utilruntime.Must(err)
3840

39-
os.Exit(m.Run())
41+
res := m.Run()
42+
err = utils.PrintSummary(testSummaryOutputEnvVar)
43+
utilruntime.Must(err)
44+
os.Exit(res)
4045
}
4146

4247
// createTestCatalog will create a new catalog on the test cluster, provided

test/e2e/metrics_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,15 +129,15 @@ func (c *MetricsTestConfig) getServiceAccountToken(t *testing.T) string {
129129
func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
130130
t.Logf("Creating curl pod (%s/%s) to validate the metrics endpoint", c.namespace, c.curlPodName)
131131
cmd := exec.Command(c.client, "run", c.curlPodName,
132-
"--image=curlimages/curl",
132+
"--image=curlimages/curl:8.15.0",
133133
"--namespace", c.namespace,
134134
"--restart=Never",
135135
"--overrides", `{
136136
"spec": {
137137
"terminationGradePeriodSeconds": 0,
138138
"containers": [{
139139
"name": "curl",
140-
"image": "curlimages/curl",
140+
"image": "curlimages/curl:8.15.0",
141141
"command": ["sh", "-c", "sleep 3600"],
142142
"securityContext": {
143143
"allowPrivilegeEscalation": false,

test/utils/summary.go

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
package utils
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"math"
7+
"os"
8+
"path/filepath"
9+
"strings"
10+
"text/template"
11+
"time"
12+
13+
"github.com/prometheus/client_golang/api"
14+
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
15+
"github.com/prometheus/common/model"
16+
)
17+
18+
// Template file names are resolved by executeTemplate relative to the
// ../utils/templates directory of the current working directory.
var (
	// summaryTemplate is the top-level template rendered by PrintSummary.
	summaryTemplate = "summary.md.tmpl"
	// alertsTemplate renders the firing and pending alerts collected by Alerts.
	alertsTemplate = "alert.md.tmpl"
	// chartTemplate renders a single mermaid xychart for PerformanceQuery.
	chartTemplate = "mermaid_chart.md.tmpl"
	// defaultPromUrl is the address PrintSummary uses to reach prometheus.
	defaultPromUrl = "http://localhost:30900"
)
24+
25+
type summaryAlerts struct {
26+
FiringAlerts []summaryAlert
27+
PendingAlerts []summaryAlert
28+
}
29+
30+
type summaryAlert struct {
31+
v1.Alert
32+
Name string
33+
Description string
34+
}
35+
36+
// xychart is the data handed to the mermaid chart template.
type xychart struct {
	// Title is the chart title.
	Title string
	// YMax and YMin are the upper and lower bounds of the y-axis.
	YMax float64
	YMin float64
	// YLabel is the label shown on the y-axis.
	YLabel string
	// Data is the pre-formatted list of values for the line, ex: [1,2,3,4]
	Data string
}
43+
44+
type githubSummary struct {
45+
client api.Client
46+
Pods []string
47+
}
48+
49+
func NewSummary(c api.Client, pods ...string) githubSummary {
50+
return githubSummary{
51+
client: c,
52+
Pods: pods,
53+
}
54+
}
55+
56+
// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data
57+
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
58+
v1api := v1.NewAPI(s.client)
59+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
60+
defer cancel()
61+
62+
fullQuery := fmt.Sprintf(query, pod)
63+
result, warnings, err := v1api.Query(ctx, fullQuery, time.Now())
64+
if err != nil {
65+
return "", err
66+
} else if len(warnings) > 0 {
67+
fmt.Printf("warnings returned from performance query; query=%s, warnings=%v", fullQuery, warnings)
68+
} else if result.Type() != model.ValMatrix {
69+
return "", fmt.Errorf("incompatible result type; need: %s, got: %s", model.ValMatrix, result.Type().String())
70+
}
71+
72+
matrix, ok := result.(model.Matrix)
73+
if !ok {
74+
return "", fmt.Errorf("typecast for metrics samples failed; aborting")
75+
} else if len(matrix) > 1 {
76+
return "", fmt.Errorf("expected 1 set of results; got: %d", len(matrix))
77+
}
78+
chart := xychart{
79+
Title: title,
80+
YLabel: yLabel,
81+
YMax: math.SmallestNonzeroFloat64,
82+
YMin: math.MaxFloat64,
83+
}
84+
formattedData := make([]string, 0)
85+
// matrix does not allow [] access, so we just do one iteration for the single result
86+
for _, metric := range matrix {
87+
if len(metric.Values) < 1 {
88+
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
89+
}
90+
for _, sample := range metric.Values {
91+
floatSample := float64(sample.Value) * scaler
92+
formattedData = append(formattedData, fmt.Sprintf("%f", floatSample))
93+
if floatSample > chart.YMax {
94+
chart.YMax = floatSample
95+
}
96+
if floatSample < chart.YMin {
97+
chart.YMin = floatSample
98+
}
99+
}
100+
}
101+
// Add some padding
102+
chart.YMax = (chart.YMax + (math.Abs(chart.YMax) * 0.05))
103+
chart.YMin = (chart.YMin - (math.Abs(chart.YMin) * 0.05))
104+
// Pretty print the values, ex: [1,2,3,4]
105+
chart.Data = strings.ReplaceAll(fmt.Sprintf("%v", formattedData), " ", ",")
106+
107+
return executeTemplate(chartTemplate, chart)
108+
}
109+
110+
// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
111+
// If no alerts are found, the alerts section will contain only "None." in the final output.
112+
func (s githubSummary) Alerts() (string, error) {
113+
v1api := v1.NewAPI(s.client)
114+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
115+
defer cancel()
116+
result, err := v1api.Alerts(ctx)
117+
if err != nil {
118+
fmt.Printf("Error querying Prometheus: %v\n", err)
119+
os.Exit(1)
120+
}
121+
122+
firingAlerts := make([]summaryAlert, 0)
123+
pendingAlerts := make([]summaryAlert, 0)
124+
if len(result.Alerts) > 0 {
125+
for _, a := range result.Alerts {
126+
aConv := summaryAlert{
127+
Alert: a,
128+
Name: string(a.Labels["alertname"]),
129+
Description: string(a.Annotations["description"]),
130+
}
131+
if a.State == v1.AlertStateFiring {
132+
firingAlerts = append(firingAlerts, aConv)
133+
} else if a.State == v1.AlertStatePending {
134+
pendingAlerts = append(pendingAlerts, aConv)
135+
}
136+
}
137+
} else {
138+
return "None.", nil
139+
}
140+
141+
return executeTemplate(alertsTemplate, summaryAlerts{
142+
FiringAlerts: firingAlerts,
143+
PendingAlerts: pendingAlerts,
144+
})
145+
}
146+
147+
func executeTemplate(templateFile string, obj any) (string, error) {
148+
wd, err := os.Getwd()
149+
if err != nil {
150+
return "", fmt.Errorf("failed to get working directory: %w", err)
151+
}
152+
tmpl, err := template.New(templateFile).ParseGlob(filepath.Join(wd, "../utils/templates", templateFile))
153+
if err != nil {
154+
return "", err
155+
}
156+
buffer := new(strings.Builder)
157+
err = tmpl.Execute(buffer, obj)
158+
if err != nil {
159+
return "", err
160+
}
161+
return buffer.String(), nil
162+
}
163+
164+
// PrintSummary executes the main summary template, generating the full test report.
165+
// The markdown is template-driven; the summary methods are called from within the
166+
// template. This allows us to add or change queries (hopefully) without needing to
167+
// touch code. The summary will be output to a file supplied by the env target.
168+
func PrintSummary(envTarget string) error {
169+
client, err := api.NewClient(api.Config{
170+
Address: defaultPromUrl,
171+
})
172+
if err != nil {
173+
fmt.Printf("Error creating prometheus client: %v\n", err)
174+
os.Exit(1)
175+
}
176+
177+
summary := NewSummary(client, "operator-controller", "catalogd")
178+
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
179+
if err != nil {
180+
return err
181+
}
182+
if path := os.Getenv(envTarget); path != "" {
183+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
184+
if err != nil {
185+
return err
186+
}
187+
fmt.Printf("Test summary output to %s successful\n", envTarget)
188+
} else {
189+
fmt.Printf("No summary output specified; skipping")
190+
}
191+
return nil
192+
}

test/utils/templates/alert.md.tmpl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{{- /* -------------------- Alert Template --------------------- */ -}}
2+
{{define "alert"}}
3+
| {{ .Name }} | {{ .Description }} |
4+
| -------- | ------- |
5+
| ActiveAt | {{ .ActiveAt }} |
6+
| State | {{ .State }} |
7+
{{- end}}
8+
9+
### Firing Alerts
10+
{{ range .FiringAlerts }}
11+
{{ template "alert" .}}
12+
{{ end }}
13+
### Pending Alerts
14+
{{ range .PendingAlerts }}
15+
{{ template "alert" .}}
16+
{{ end }}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<details>
2+
3+
```mermaid
4+
---
5+
config:
6+
xyChart:
7+
showDataLabel: true
8+
xAxis:
9+
showLabel: false
10+
---
11+
xychart-beta
12+
title "{{ .Title }}"
13+
y-axis "{{ .YLabel }}" {{printf "%f" .YMin}} --> {{printf "%f" .YMax}}
14+
x-axis "time (start of test to end)"
15+
line {{.Data}}
16+
```
17+
</details>

test/utils/templates/summary.md.tmpl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
{{- /* ------------ Performance Statistics Template ------------ */ -}}
3+
{{define "performanceStatistics" -}}
4+
{{ range $index, $pod := .Pods }}
5+
### {{$pod}}
6+
#### Memory Usage
7+
{{$.PerformanceQuery "Memory Usage" $pod `container_memory_working_set_bytes{pod=~"%s.*",container="manager"}[5m]` "MB" .000001}}
8+
9+
#### Memory Growth Rate
10+
{{$.PerformanceQuery "Memory Growth Rate" $pod `deriv(sum(container_memory_working_set_bytes{pod=~"%s.*",container="manager"})[5m:])[5m:]` "KB/s" .001}}
11+
12+
#### CPU Usage
13+
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
{{end}}
15+
{{- end}}
16+
17+
{{- /* ----------------- E2E Summary Markdown ------------------ */ -}}
18+
# E2E Summary
19+
## Alerts
20+
{{.Alerts}}
21+
## Performance
22+
{{ template "performanceStatistics" . -}}

0 commit comments

Comments
 (0)