Skip to content

Commit 4961a42

Browse files
committed
Metrics Summary
Adds a util to the e2e suite which queries prometheus at the end of the test run for alerts and metrics data. This data is then processed into markdown which is displayed to the contributor at the end of their test runs. Signed-off-by: Daniel Franz <[email protected]>
1 parent e0b5c18 commit 4961a42

File tree

11 files changed

+270
-18
lines changed

11 files changed

+270
-18
lines changed

Makefile

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -272,27 +272,21 @@ test-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)
272272
test-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
273273
test-e2e: GO_BUILD_EXTRA_FLAGS := -cover
274274
test-e2e: COVERAGE_NAME := e2e
275-
test-e2e: run image-registry prometheus e2e e2e-metrics e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster
275+
test-e2e: run image-registry prometheus e2e e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster
276276

277277
.PHONY: test-experimental-e2e
278278
test-experimental-e2e: SOURCE_MANIFEST := $(EXPERIMENTAL_E2E_MANIFEST)
279279
test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
280280
test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover
281281
test-experimental-e2e: COVERAGE_NAME := experimental-e2e
282-
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
282+
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
283283

284284
.PHONY: prometheus
285285
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
286286
prometheus: PROMETHEUS_VERSION := v0.83.0
287287
prometheus: #EXHELP Deploy Prometheus into specified namespace
288288
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION)
289289

290-
# The output alerts.out file contains any alerts, pending or firing, collected during a test run in json format.
291-
.PHONY: e2e-metrics
292-
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
293-
e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
294-
curl -X GET http://localhost:30900/api/v1/alerts | jq 'if (.data.alerts | length) > 0 then .data.alerts.[] else empty end' > $(ALERTS_FILE_PATH)
295-
296290
.PHONY: extension-developer-e2e
297291
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
298292
extension-developer-e2e: export INSTALL_DEFAULT_CATALOGS := false

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ spec:
2222
annotations:
2323
description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}"
2424
- alert: operator-controller-memory-growth
25-
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
25+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
2626
for: 5m
2727
keep_firing_for: 1d
2828
annotations:
2929
description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
3030
- alert: catalogd-memory-growth
31-
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
31+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
3232
for: 5m
3333
keep_firing_for: 1d
3434
annotations:

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ require (
2323
github.com/operator-framework/helm-operator-plugins v0.8.0
2424
github.com/operator-framework/operator-registry v1.56.0
2525
github.com/prometheus/client_golang v1.23.0
26+
github.com/prometheus/common v0.65.0
2627
github.com/spf13/cobra v1.9.1
2728
github.com/stretchr/testify v1.10.0
2829
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
@@ -177,7 +178,6 @@ require (
177178
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
178179
github.com/proglottis/gpgme v0.1.4 // indirect
179180
github.com/prometheus/client_model v0.6.2 // indirect
180-
github.com/prometheus/common v0.65.0 // indirect
181181
github.com/prometheus/procfs v0.16.1 // indirect
182182
github.com/rivo/uniseg v0.4.7 // indirect
183183
github.com/rubenv/sql-migrate v1.8.0 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,8 @@ github.com/joelanford/ignore v0.1.1 h1:vKky5RDoPT+WbONrbQBgOn95VV/UPh4ejlyAbbzgn
279279
github.com/joelanford/ignore v0.1.1/go.mod h1:8eho/D8fwQ3rIXrLwE23AaeaGDNXqLE9QJ3zJ4LIPCw=
280280
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
281281
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
282+
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
283+
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
282284
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
283285
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
284286
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
@@ -352,6 +354,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/
352354
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4=
353355
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
354356
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
357+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
358+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
355359
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
356360
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
357361
github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY=

test/e2e/e2e_suite_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
ocv1 "github.com/operator-framework/operator-controller/api/v1"
1717
"github.com/operator-framework/operator-controller/internal/operator-controller/scheme"
18+
utils "github.com/operator-framework/operator-controller/test/utils"
1819
)
1920

2021
var (
@@ -23,9 +24,10 @@ var (
2324
)
2425

2526
const (
26-
testCatalogRefEnvVar = "CATALOG_IMG"
27-
testCatalogName = "test-catalog"
28-
latestImageTag = "latest"
27+
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testCatalogRefEnvVar = "CATALOG_IMG"
29+
testCatalogName = "test-catalog"
30+
latestImageTag = "latest"
2931
)
3032

3133
func TestMain(m *testing.M) {
@@ -36,7 +38,10 @@ func TestMain(m *testing.M) {
3638
c, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
3739
utilruntime.Must(err)
3840

39-
os.Exit(m.Run())
41+
res := m.Run()
42+
err = utils.PrintSummary(testSummaryOutputEnvVar)
43+
utilruntime.Must(err)
44+
os.Exit(res)
4045
}
4146

4247
// createTestCatalog will create a new catalog on the test cluster, provided

test/e2e/metrics_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,15 +129,15 @@ func (c *MetricsTestConfig) getServiceAccountToken(t *testing.T) string {
129129
func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
130130
t.Logf("Creating curl pod (%s/%s) to validate the metrics endpoint", c.namespace, c.curlPodName)
131131
cmd := exec.Command(c.client, "run", c.curlPodName,
132-
"--image=curlimages/curl",
132+
"--image=curlimages/curl:8.15.0",
133133
"--namespace", c.namespace,
134134
"--restart=Never",
135135
"--overrides", `{
136136
"spec": {
137137
"terminationGradePeriodSeconds": 0,
138138
"containers": [{
139139
"name": "curl",
140-
"image": "curlimages/curl",
140+
"image": "curlimages/curl:8.15.0",
141141
"command": ["sh", "-c", "sleep 3600"],
142142
"securityContext": {
143143
"allowPrivilegeEscalation": false,

test/utils/summary.go

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
package utils
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"math"
7+
"os"
8+
"path/filepath"
9+
"strings"
10+
"text/template"
11+
"time"
12+
13+
"github.com/prometheus/client_golang/api"
14+
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
15+
"github.com/prometheus/common/model"
16+
)
17+
18+
// File names of the markdown templates rendered by executeTemplate, and the
// address PrintSummary uses when it creates its Prometheus API client.
var (
	summaryTemplate = "summary.md.tmpl"
	alertsTemplate  = "alert.md.tmpl"
	chartTemplate   = "mermaid_chart.md.tmpl"
	defaultPromUrl  = "http://localhost:30900"
)
24+
25+
type summaryAlerts struct {
26+
FiringAlerts []summaryAlert
27+
PendingAlerts []summaryAlert
28+
}
29+
30+
type summaryAlert struct {
31+
v1.Alert
32+
Name string
33+
Description string
34+
}
35+
36+
// xychart is the value PerformanceQuery passes to the chart template.
type xychart struct {
	// Title is the chart title supplied by the caller of PerformanceQuery.
	Title string
	// YMax and YMin are the largest and smallest scaled samples after
	// PerformanceQuery has applied its padding.
	YMax float64
	YMin float64
	// YLabel is the y-axis label supplied by the caller of PerformanceQuery.
	YLabel string
	// Data is the list of scaled samples formatted as "[v1,v2,...]".
	Data string
}
43+
44+
type githubSummary struct {
45+
client api.Client
46+
Pods []string
47+
}
48+
49+
func NewSummary(c api.Client, pods ...string) githubSummary {
50+
return githubSummary{
51+
client: c,
52+
Pods: pods,
53+
}
54+
}
55+
56+
// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data
57+
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
58+
v1api := v1.NewAPI(s.client)
59+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
60+
defer cancel()
61+
62+
fullQuery := fmt.Sprintf(query, pod)
63+
result, warnings, err := v1api.Query(ctx, fullQuery, time.Now())
64+
if err != nil {
65+
return "", err
66+
} else if len(warnings) > 0 {
67+
fmt.Printf("warnings returned from performance query; query=%s, warnings=%v", fullQuery, warnings)
68+
} else if result.Type() != model.ValMatrix {
69+
return "", fmt.Errorf("incompatible result type; need: %s, got: %s", model.ValMatrix, result.Type().String())
70+
}
71+
72+
matrix, ok := result.(model.Matrix)
73+
if !ok {
74+
return "", fmt.Errorf("typecast for metrics samples failed; aborting")
75+
} else if len(matrix) > 1 {
76+
return "", fmt.Errorf("expected 1 set of results; got: %d", len(matrix))
77+
}
78+
chart := xychart{
79+
Title: title,
80+
YLabel: yLabel,
81+
YMax: math.SmallestNonzeroFloat64,
82+
YMin: math.MaxFloat64,
83+
}
84+
formattedData := make([]string, 0)
85+
// matrix does not allow [] access, so we just do one iteration for the single result
86+
for _, metric := range matrix {
87+
if len(metric.Values) < 1 {
88+
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
89+
}
90+
for _, sample := range metric.Values {
91+
floatSample := float64(sample.Value) * scaler
92+
formattedData = append(formattedData, fmt.Sprintf("%f", floatSample))
93+
if floatSample > chart.YMax {
94+
chart.YMax = floatSample
95+
}
96+
if floatSample < chart.YMin {
97+
chart.YMin = floatSample
98+
}
99+
}
100+
}
101+
// Add some padding
102+
chart.YMax = (chart.YMax + (math.Abs(chart.YMax) * 0.05))
103+
chart.YMin = (chart.YMin - (math.Abs(chart.YMin) * 0.05))
104+
// Pretty print the values, ex: [1,2,3,4]
105+
chart.Data = strings.ReplaceAll(fmt.Sprintf("%v", formattedData), " ", ",")
106+
107+
return executeTemplate(chartTemplate, chart)
108+
}
109+
110+
// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
111+
// If no alerts are found, the alerts section will contain only "None." in the final output.
112+
func (s githubSummary) Alerts() (string, error) {
113+
v1api := v1.NewAPI(s.client)
114+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
115+
defer cancel()
116+
result, err := v1api.Alerts(ctx)
117+
if err != nil {
118+
fmt.Printf("Error querying Prometheus: %v\n", err)
119+
os.Exit(1)
120+
}
121+
122+
firingAlerts := make([]summaryAlert, 0)
123+
pendingAlerts := make([]summaryAlert, 0)
124+
if len(result.Alerts) > 0 {
125+
for _, a := range result.Alerts {
126+
aConv := summaryAlert{
127+
Alert: a,
128+
Name: string(a.Labels["alertname"]),
129+
Description: string(a.Annotations["description"]),
130+
}
131+
switch a.State {
132+
case v1.AlertStateFiring:
133+
firingAlerts = append(firingAlerts, aConv)
134+
case v1.AlertStatePending:
135+
pendingAlerts = append(pendingAlerts, aConv)
136+
// Ignore AlertStateInactive; the alerts endpoint doesn't return them
137+
}
138+
}
139+
} else {
140+
return "None.", nil
141+
}
142+
143+
return executeTemplate(alertsTemplate, summaryAlerts{
144+
FiringAlerts: firingAlerts,
145+
PendingAlerts: pendingAlerts,
146+
})
147+
}
148+
149+
func executeTemplate(templateFile string, obj any) (string, error) {
150+
wd, err := os.Getwd()
151+
if err != nil {
152+
return "", fmt.Errorf("failed to get working directory: %w", err)
153+
}
154+
tmpl, err := template.New(templateFile).ParseGlob(filepath.Join(wd, "../utils/templates", templateFile))
155+
if err != nil {
156+
return "", err
157+
}
158+
buffer := new(strings.Builder)
159+
err = tmpl.Execute(buffer, obj)
160+
if err != nil {
161+
return "", err
162+
}
163+
return buffer.String(), nil
164+
}
165+
166+
// PrintSummary executes the main summary template, generating the full test report.
167+
// The markdown is template-driven; the summary methods are called from within the
168+
// template. This allows us to add or change queries (hopefully) without needing to
169+
// touch code. The summary will be output to a file supplied by the env target.
170+
func PrintSummary(envTarget string) error {
171+
client, err := api.NewClient(api.Config{
172+
Address: defaultPromUrl,
173+
})
174+
if err != nil {
175+
fmt.Printf("Error creating prometheus client: %v\n", err)
176+
os.Exit(1)
177+
}
178+
179+
summary := NewSummary(client, "operator-controller", "catalogd")
180+
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
181+
if err != nil {
182+
return err
183+
}
184+
if path := os.Getenv(envTarget); path != "" {
185+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
186+
if err != nil {
187+
return err
188+
}
189+
fmt.Printf("Test summary output to %s successful\n", envTarget)
190+
} else {
191+
fmt.Printf("No summary output specified; skipping")
192+
}
193+
return nil
194+
}

test/utils/templates/alert.md.tmpl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{{- /* -------------------- Alert Template --------------------- */ -}}
2+
{{define "alert"}}
3+
| {{ .Name }} | {{ .Description }} |
4+
| -------- | ------- |
5+
| ActiveAt | {{ .ActiveAt }} |
6+
| State | {{ .State }} |
7+
{{- end}}
8+
9+
### Firing Alerts
10+
{{ range .FiringAlerts }}
11+
{{ template "alert" .}}
12+
{{ end }}
13+
### Pending Alerts
14+
{{ range .PendingAlerts }}
15+
{{ template "alert" .}}
16+
{{ end }}
test/utils/templates/mermaid_chart.md.tmpl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<details>
2+
3+
```mermaid
4+
---
5+
config:
6+
xyChart:
7+
showDataLabel: true
8+
xAxis:
9+
showLabel: false
10+
---
11+
xychart-beta
12+
title "{{ .Title }}"
13+
y-axis "{{ .YLabel }}" {{printf "%f" .YMin}} --> {{printf "%f" .YMax}}
14+
x-axis "time (start of test to end)"
15+
line {{.Data}}
16+
```
17+
</details>

test/utils/templates/summary.md.tmpl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
{{- /* ------------ Performance Statistics Template ------------ */ -}}
3+
{{define "performanceStatistics" -}}
4+
{{ range $index, $pod := .Pods }}
5+
### {{$pod}}
6+
#### Memory Usage
7+
{{$.PerformanceQuery "Memory Usage" $pod `container_memory_working_set_bytes{pod=~"%s.*",container="manager"}[5m]` "MB" .000001}}
8+
9+
#### Memory Growth Rate
10+
{{$.PerformanceQuery "Memory Growth Rate" $pod `deriv(sum(container_memory_working_set_bytes{pod=~"%s.*",container="manager"})[5m:])[5m:]` "KB/s" .001}}
11+
12+
#### CPU Usage
13+
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
{{end}}
15+
{{- end}}
16+
17+
{{- /* ----------------- E2E Summary Markdown ------------------ */ -}}
18+
# E2E Summary
19+
## Alerts
20+
{{.Alerts}}
21+
## Performance
22+
{{ template "performanceStatistics" . -}}

0 commit comments

Comments
 (0)