diff --git a/Makefile b/Makefile index 67c876a59..8596edee9 100644 --- a/Makefile +++ b/Makefile @@ -272,14 +272,14 @@ test-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST) test-e2e: KIND_CLUSTER_NAME := operator-controller-e2e test-e2e: GO_BUILD_EXTRA_FLAGS := -cover test-e2e: COVERAGE_NAME := e2e -test-e2e: run image-registry prometheus e2e e2e-metrics e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster +test-e2e: run image-registry prometheus e2e e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster .PHONY: test-experimental-e2e test-experimental-e2e: SOURCE_MANIFEST := $(EXPERIMENTAL_E2E_MANIFEST) test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover test-experimental-e2e: COVERAGE_NAME := experimental-e2e -test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster +test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster .PHONY: prometheus prometheus: PROMETHEUS_NAMESPACE := olmv1-system @@ -287,12 +287,6 @@ prometheus: PROMETHEUS_VERSION := v0.83.0 prometheus: #EXHELP Deploy Prometheus into specified namespace ./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION) -# The output alerts.out file contains any alerts, pending or firing, collected during a test run in json format. 
-.PHONY: e2e-metrics -e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out -e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set - curl -X GET http://localhost:30900/api/v1/alerts | jq 'if (.data.alerts | length) > 0 then .data.alerts.[] else empty end' > $(ALERTS_FILE_PATH) - .PHONY: extension-developer-e2e extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e extension-developer-e2e: export INSTALL_DEFAULT_CATALOGS := false diff --git a/config/overlays/prometheus/prometheus_rule.yaml b/config/overlays/prometheus/prometheus_rule.yaml index 16e4bfd1a..5bd7e120b 100644 --- a/config/overlays/prometheus/prometheus_rule.yaml +++ b/config/overlays/prometheus/prometheus_rule.yaml @@ -22,13 +22,13 @@ spec: annotations: description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}" - alert: operator-controller-memory-growth - expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000 + expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000 for: 5m keep_firing_for: 1d annotations: description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec" - alert: catalogd-memory-growth - expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000 + expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000 for: 5m keep_firing_for: 1d annotations: diff --git a/go.mod b/go.mod index 7f2e8a9e3..02c716230 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( github.com/operator-framework/helm-operator-plugins v0.8.0 github.com/operator-framework/operator-registry v1.56.0 github.com/prometheus/client_golang v1.23.0 + github.com/prometheus/common v0.65.0 github.com/spf13/cobra 
v1.9.1 github.com/stretchr/testify v1.10.0 golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b @@ -177,7 +178,6 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/proglottis/gpgme v0.1.4 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rubenv/sql-migrate v1.8.0 // indirect diff --git a/go.sum b/go.sum index 897a5ba13..dedd5be96 100644 --- a/go.sum +++ b/go.sum @@ -279,6 +279,8 @@ github.com/joelanford/ignore v0.1.1 h1:vKky5RDoPT+WbONrbQBgOn95VV/UPh4ejlyAbbzgn github.com/joelanford/ignore v0.1.1/go.mod h1:8eho/D8fwQ3rIXrLwE23AaeaGDNXqLE9QJ3zJ4LIPCw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= @@ -352,6 +354,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack 
v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 354ef75f4..dabfb48ca 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -15,6 +15,7 @@ import ( ocv1 "github.com/operator-framework/operator-controller/api/v1" "github.com/operator-framework/operator-controller/internal/operator-controller/scheme" + utils "github.com/operator-framework/operator-controller/test/utils" ) var ( @@ -23,9 +24,10 @@ var ( ) const ( - testCatalogRefEnvVar = "CATALOG_IMG" - testCatalogName = "test-catalog" - latestImageTag = "latest" + testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY" + testCatalogRefEnvVar = "CATALOG_IMG" + testCatalogName = "test-catalog" + latestImageTag = "latest" ) func TestMain(m *testing.M) { @@ -36,7 +38,10 @@ func TestMain(m *testing.M) { c, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) utilruntime.Must(err) - os.Exit(m.Run()) + res := m.Run() + err = utils.PrintSummary(testSummaryOutputEnvVar) + utilruntime.Must(err) + os.Exit(res) } // createTestCatalog will create a new catalog on the test cluster, provided diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 4a88c3dca..85908f4d5 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -129,7 +129,7 @@ func (c *MetricsTestConfig) getServiceAccountToken(t *testing.T) string { func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) { t.Logf("Creating curl pod (%s/%s) to validate the metrics endpoint", c.namespace, c.curlPodName) cmd := exec.Command(c.client, "run", c.curlPodName, - "--image=curlimages/curl", + 
"--image=curlimages/curl:8.15.0", "--namespace", c.namespace, "--restart=Never", "--overrides", `{ @@ -137,7 +137,7 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) { "terminationGradePeriodSeconds": 0, "containers": [{ "name": "curl", - "image": "curlimages/curl", + "image": "curlimages/curl:8.15.0", "command": ["sh", "-c", "sleep 3600"], "securityContext": { "allowPrivilegeEscalation": false, diff --git a/test/utils/summary.go b/test/utils/summary.go new file mode 100644 index 000000000..d91ae3239 --- /dev/null +++ b/test/utils/summary.go @@ -0,0 +1,199 @@ +package utils + +import ( + "context" + "fmt" + "math" + "os" + "path/filepath" + "strings" + "text/template" + "time" + + "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" +) + +var ( + summaryTemplate = "summary.md.tmpl" + alertsTemplate = "alert.md.tmpl" + chartTemplate = "mermaid_chart.md.tmpl" + defaultPromUrl = "http://localhost:30900" +) + +type summaryAlerts struct { + FiringAlerts []summaryAlert + PendingAlerts []summaryAlert +} + +type summaryAlert struct { + v1.Alert + Name string + Description string +} + +type xychart struct { + Title string + YMax float64 + YMin float64 + YLabel string + Data string +} + +type githubSummary struct { + client api.Client + Pods []string +} + +func NewSummary(c api.Client, pods ...string) githubSummary { + return githubSummary{ + client: c, + Pods: pods, + } +} + +// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data. +// title - Display name of the xychart +// pod - Pod name with which to filter results from prometheus +// query - Prometheus query +// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc. +// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable +// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000. 
+//
+// The query must evaluate to a matrix holding exactly one series with at least
+// one sample; any other result is returned as an error.
+func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
+	v1api := v1.NewAPI(s.client)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	// query is used as a format string with pod as its only argument.
+	fullQuery := fmt.Sprintf(query, pod)
+	result, warnings, err := v1api.Query(ctx, fullQuery, time.Now())
+	if err != nil {
+		return "", fmt.Errorf("performance query failed; query=%s: %w", fullQuery, err)
+	}
+	// Warnings are informational only: log them, then keep validating the result
+	// instead of skipping the type check below.
+	if len(warnings) > 0 {
+		fmt.Printf("warnings returned from performance query; query=%s, warnings=%v\n", fullQuery, warnings)
+	}
+	if result.Type() != model.ValMatrix {
+		return "", fmt.Errorf("incompatible result type; need: %s, got: %s", model.ValMatrix, result.Type().String())
+	}
+
+	matrix, ok := result.(model.Matrix)
+	if !ok {
+		return "", fmt.Errorf("typecast for metrics samples failed; aborting")
+	}
+	// Reject zero series as well as several: with no series there is nothing to
+	// chart and the axis bounds below would never be updated.
+	if len(matrix) != 1 {
+		return "", fmt.Errorf("expected 1 set of results; got: %d", len(matrix))
+	}
+	samples := matrix[0].Values
+	if len(samples) < 1 {
+		return "", fmt.Errorf("expected at least one data point; got: %d", len(samples))
+	}
+
+	// Start from an inverted infinite range so the first sample always replaces
+	// both bounds, including when every sample is negative.
+	chart := xychart{
+		Title:  title,
+		YLabel: yLabel,
+		YMax:   math.Inf(-1),
+		YMin:   math.Inf(1),
+	}
+	formattedData := make([]string, 0, len(samples))
+	for _, sample := range samples {
+		floatSample := float64(sample.Value) * scaler
+		formattedData = append(formattedData, fmt.Sprintf("%f", floatSample))
+		if floatSample > chart.YMax {
+			chart.YMax = floatSample
+		}
+		if floatSample < chart.YMin {
+			chart.YMin = floatSample
+		}
+	}
+	// Add some padding
+	chart.YMax += math.Abs(chart.YMax) * 0.05
+	chart.YMin -= math.Abs(chart.YMin) * 0.05
+	// Pretty print the values, ex: [1,2,3,4]
+	chart.Data = "[" + strings.Join(formattedData, ",") + "]"
+
+	return executeTemplate(chartTemplate, chart)
+}
+
+// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
+// If no alerts are found, the alerts section will contain only "None." in the final output.
+func (s githubSummary) Alerts() (string, error) {
+	v1api := v1.NewAPI(s.client)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	result, err := v1api.Alerts(ctx)
+	if err != nil {
+		return "", fmt.Errorf("failed to query alerts: %w", err)
+	}
+	if len(result.Alerts) == 0 {
+		return "None.", nil
+	}
+
+	firingAlerts := make([]summaryAlert, 0)
+	pendingAlerts := make([]summaryAlert, 0)
+	for _, a := range result.Alerts {
+		aConv := summaryAlert{
+			Alert:       a,
+			Name:        string(a.Labels["alertname"]),
+			Description: string(a.Annotations["description"]),
+		}
+		switch a.State {
+		case v1.AlertStateFiring:
+			firingAlerts = append(firingAlerts, aConv)
+		case v1.AlertStatePending:
+			pendingAlerts = append(pendingAlerts, aConv)
+			// Ignore AlertStateInactive; the alerts endpoint doesn't return them
+		}
+	}
+
+	return executeTemplate(alertsTemplate, summaryAlerts{
+		FiringAlerts:  firingAlerts,
+		PendingAlerts: pendingAlerts,
+	})
+}
+
+// executeTemplate parses the named template file and renders it with obj as
+// its data, returning the rendered text.
+//
+// The file is looked up in "../utils/templates" relative to the current working
+// directory, so the result depends on where the calling process was started.
+func executeTemplate(templateFile string, obj any) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", fmt.Errorf("failed to get working directory: %w", err)
+	}
+	// ParseFiles takes the path literally; ParseGlob would interpret glob
+	// metacharacters that happen to appear in the working directory. The
+	// template is named after the file so that Execute selects the parsed file.
+	path := filepath.Join(wd, "../utils/templates", templateFile)
+	tmpl, err := template.New(templateFile).ParseFiles(path)
+	if err != nil {
+		return "", fmt.Errorf("failed to parse template %s: %w", templateFile, err)
+	}
+	var buffer strings.Builder
+	if err := tmpl.Execute(&buffer, obj); err != nil {
+		return "", fmt.Errorf("failed to execute template %s: %w", templateFile, err)
+	}
+	return buffer.String(), nil
+}
+
+// PrintSummary executes the main summary template, generating the full test report.
+// The markdown is template-driven; the summary methods are called from within the
+// template. This allows us to add or change queries (hopefully) without needing to
+// touch code. The summary will be output to a file supplied by the env target.
+//
+// envTarget is the NAME of an environment variable whose value is the output
+// file path. When that variable is unset or empty nothing is queried or written
+// and nil is returned. Failures to create the prometheus client, render the
+// report or write the file are returned to the caller.
+func PrintSummary(envTarget string) error {
+	// Resolve the destination first so that runs without a summary target do
+	// not contact prometheus at all.
+	path := os.Getenv(envTarget)
+	if path == "" {
+		fmt.Printf("No summary output specified; skipping\n")
+		return nil
+	}
+
+	client, err := api.NewClient(api.Config{
+		Address: defaultPromUrl,
+	})
+	if err != nil {
+		// Return instead of exiting so the caller keeps control of the process
+		// exit code.
+		return fmt.Errorf("error creating prometheus client: %w", err)
+	}
+
+	summary := NewSummary(client, "operator-controller", "catalogd")
+	summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
+	if err != nil {
+		return err
+	}
+	if err := os.WriteFile(path, []byte(summaryMarkdown), 0o600); err != nil {
+		return err
+	}
+	// Report the file that was written, not the name of the variable.
+	fmt.Printf("Test summary output to %s successful\n", path)
+	return nil
+}
diff --git a/test/utils/templates/alert.md.tmpl b/test/utils/templates/alert.md.tmpl
new file mode 100644
index 000000000..39f3e4287
--- /dev/null
+++ b/test/utils/templates/alert.md.tmpl
@@ -0,0 +1,16 @@
+{{- /* -------------------- Alert Template --------------------- */ -}}
+{{define "alert"}}
+| {{ .Name }} | {{ .Description }} |
+| -------- | ------- |
+| ActiveAt | {{ .ActiveAt }} |
+| State | {{ .State }} |
+{{- end}}
+
+### Firing Alerts
+{{ range .FiringAlerts }}
+{{ template "alert" .}}
+{{ end }}
+### Pending Alerts
+{{ range .PendingAlerts }}
+{{ template "alert" .}}
+{{ end }}
diff --git a/test/utils/templates/mermaid_chart.md.tmpl b/test/utils/templates/mermaid_chart.md.tmpl
new file mode 100644
index 000000000..0a8ed1135
--- /dev/null
+++ b/test/utils/templates/mermaid_chart.md.tmpl
@@ -0,0 +1,17 @@
+
+ +```mermaid +--- +config: + xyChart: + showDataLabel: true + xAxis: + showLabel: false +--- +xychart-beta +title "{{ .Title }}" +y-axis "{{ .YLabel }}" {{printf "%f" .YMin}} --> {{printf "%f" .YMax}} +x-axis "time (start of test to end)" +line {{.Data}} +``` +
diff --git a/test/utils/templates/summary.md.tmpl b/test/utils/templates/summary.md.tmpl new file mode 100644 index 000000000..c094d49f3 --- /dev/null +++ b/test/utils/templates/summary.md.tmpl @@ -0,0 +1,22 @@ + +{{- /* ------------ Performance Statistics Template ------------ */ -}} +{{define "performanceStatistics" -}} +{{ range $index, $pod := .Pods }} +### {{$pod}} +#### Memory Usage +{{$.PerformanceQuery "Memory Usage" $pod `container_memory_working_set_bytes{pod=~"%s.*",container="manager"}[5m]` "MB" .000001}} + +#### Memory Growth Rate +{{$.PerformanceQuery "Memory Growth Rate" $pod `deriv(sum(container_memory_working_set_bytes{pod=~"%s.*",container="manager"})[5m:])[5m:]` "KB/s" .001}} + +#### CPU Usage +{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}} +{{end}} +{{- end}} + +{{- /* ----------------- E2E Summary Markdown ------------------ */ -}} +# E2E Summary +## Alerts +{{.Alerts}} +## Performance +{{ template "performanceStatistics" . -}} diff --git a/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml b/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml index a566e3595..3520f53db 100644 --- a/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml +++ b/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml @@ -58,7 +58,7 @@ spec: terminationGracePeriodSeconds: 0 containers: - name: busybox - image: busybox + image: busybox:1.36 command: - 'sleep' - '1000'