From 990511cf0bfb33ba57081a1ccd29ba832b7af4ac Mon Sep 17 00:00:00 2001 From: Paschalis T Date: Thu, 17 Jul 2025 13:11:56 +0300 Subject: [PATCH 1/8] Introduce OTel Collector mixin Signed-off-by: Paschalis T --- opentelemetry-collector-mixin/LICENSE | 201 ++ opentelemetry-collector-mixin/Makefile | 54 + opentelemetry-collector-mixin/README.md | 115 + opentelemetry-collector-mixin/alerts.jsonnet | 1 + .../alerts/alerts.libsonnet | 25 + .../config.libsonnet | 23 + .../dashboards.jsonnet | 12 + .../dashboards/collector.libsonnet | 135 ++ .../dashboards/dashboards.libsonnet | 1 + .../dashboards/utils/panels.libsonnet | 207 ++ .../dashboards/utils/queries.libsonnet | 505 ++++ .../dashboards/utils/variables.libsonnet | 51 + .../dashboards_out/.lint | 7 + .../dashboards_out/collector-before.json | 2096 +++++++++++++++++ .../dashboards_out/collector.json | 2096 +++++++++++++++++ .../jsonnetfile.json | 15 + opentelemetry-collector-mixin/mixin.libsonnet | 4 + .../prometheus_alerts.yaml | 12 + .../prometheus_rules.yaml | 3 + opentelemetry-collector-mixin/rules.jsonnet | 1 + .../rules/rules.libsonnet | 11 + 21 files changed, 5575 insertions(+) create mode 100644 opentelemetry-collector-mixin/LICENSE create mode 100644 opentelemetry-collector-mixin/Makefile create mode 100644 opentelemetry-collector-mixin/README.md create mode 100644 opentelemetry-collector-mixin/alerts.jsonnet create mode 100644 opentelemetry-collector-mixin/alerts/alerts.libsonnet create mode 100644 opentelemetry-collector-mixin/config.libsonnet create mode 100644 opentelemetry-collector-mixin/dashboards.jsonnet create mode 100644 opentelemetry-collector-mixin/dashboards/collector.libsonnet create mode 100644 opentelemetry-collector-mixin/dashboards/dashboards.libsonnet create mode 100644 opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet create mode 100644 opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet create mode 100644 
opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet create mode 100644 opentelemetry-collector-mixin/dashboards_out/.lint create mode 100644 opentelemetry-collector-mixin/dashboards_out/collector-before.json create mode 100644 opentelemetry-collector-mixin/dashboards_out/collector.json create mode 100644 opentelemetry-collector-mixin/jsonnetfile.json create mode 100644 opentelemetry-collector-mixin/mixin.libsonnet create mode 100644 opentelemetry-collector-mixin/prometheus_alerts.yaml create mode 100644 opentelemetry-collector-mixin/prometheus_rules.yaml create mode 100644 opentelemetry-collector-mixin/rules.jsonnet create mode 100644 opentelemetry-collector-mixin/rules/rules.libsonnet diff --git a/opentelemetry-collector-mixin/LICENSE b/opentelemetry-collector-mixin/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/opentelemetry-collector-mixin/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/opentelemetry-collector-mixin/Makefile b/opentelemetry-collector-mixin/Makefile new file mode 100644 index 000000000..efe0be7da --- /dev/null +++ b/opentelemetry-collector-mixin/Makefile @@ -0,0 +1,54 @@ +JSONNET_VENDOR=vendor +DASHBOARDS_OUT_DIR ?=dashboards_out +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +.PHONY: all +all: fmt generate lint + +.PHONY: generate +generate: prometheus_alerts.yaml prometheus_rules.yaml $(DASHBOARDS_OUT_DIR) + +.PHONY: fmt +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- jsonnetfmt -i + +.PHONY: lint +lint: jsonnet-lint alerts-lint dashboards-lint + +.PHONY: jsonnet-lint +jsonnet-lint: + @find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- jsonnet-lint -J vendor + +.PHONY: alerts-lint +alerts-lint: prometheus_alerts.yaml prometheus_rules.yaml + promtool check rules prometheus_rules.yaml + promtool check rules prometheus_alerts.yaml + +.PHONY: dashboards-lint +dashboards-lint: + @find $(DASHBOARDS_OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 dashboard-linter lint --strict + +.PHONY: prometheus_alerts.yaml +prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts/*.libsonnet + jsonnet -S alerts.jsonnet >$@ + +.PHONY: prometheus_rules.yaml +prometheus_rules.yaml: mixin.libsonnet config.libsonnet rules/*.libsonnet + jsonnet -S rules.jsonnet >$@ + + +.PHONY: dashboards_out +dashboards_out: mixin.libsonnet config.libsonnet dashboards/*.libsonnet + @mkdir -p dashboards_out + cp .lint dashboards_out/.lint + jsonnet -J vendor -m dashboards_out dashboards.jsonnet + +.PHONY: clean +clean: + rm -rf prometheus_alerts.yaml prometheus_rules.yaml dashboards_out + +.PHONY: jb_install +jb_install: + jb install diff --git a/opentelemetry-collector-mixin/README.md b/opentelemetry-collector-mixin/README.md new file mode 100644 index 000000000..d19c71264 --- /dev/null +++ b/opentelemetry-collector-mixin/README.md @@ -0,0 +1,115 @@ +# opentelemetry-collector-mixin + +Prometheus Monitoring Mixin for the OpenTelemetry Collector + +This mixin contains a set of Prometheus alert rules and Grafana dashboards +based on the metrics exported by the OpenTelemetry Collector's [internal +telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/). + +To use it, you need to have `jsonnet` (any sufficiently modern version should +do, but ideally v0.20+) and `jb` installed. 
+ +If you have a working Go development environment, you can run the following to +get started: +``` +go install github.com/google/go-jsonnet/cmd/jsonnet@latest +go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest +``` + +### Usage + +First, install the dependencies by running the following command from this +mixin's directory (the one containing `jsonnetfile.json`): +``` +$ jb install +``` + +You can then build the Prometheus alert and recording rules by running: +``` +$ make prometheus_alerts.yaml +$ make prometheus_rules.yaml +``` + +You can also render a JSON dashboard file for Grafana by running the following +command. The results are stored in the `dashboards_out/` directory. +``` +$ make dashboards_out +``` + +### OpenTelemetry Collector configuration + +By default, the OpenTelemetry Collector exposes its [internal +telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/) as +Prometheus metrics on port 8888. + +The following configuration can be used as a starting point for scraping these +metrics and sending them to a Prometheus-compatible store. + +```yaml +extensions: + basicauth/remote_write: + client_auth: + username: "username" + password: "password" + +receivers: + prometheus: + config: + scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 15s + static_configs: + - targets: ['0.0.0.0:8888'] + +processors: + batch: + +exporters: + prometheusremotewrite: + endpoint: "http://prometheus/api/prom/push" + auth: + authenticator: basicauth/remote_write + resource_to_telemetry_conversion: + enabled: true # Convert resource attributes to metric labels + +service: + telemetry: + metrics: + level: "detailed" + readers: + - pull: + exporter: + prometheus: + host: '0.0.0.0' + port: 8888 + extensions: [basicauth/remote_write] + pipelines: + metrics: + receivers: [prometheus] + processors: [batch] + exporters: [prometheusremotewrite] +``` + +### Other requirements + +The Makefile contains commands for formatting, linting and testing the mixin.
+For development purposes you may need one or more of the following as well. +``` +go install github.com/google/go-jsonnet/cmd/jsonnet-lint@latest +go install github.com/grafana/dashboard-linter@latest +go install github.com/prometheus/prometheus/cmd/promtool@latest +go install github.com/monitoring-mixins/mixtool/cmd/mixtool@main +``` + +### Contributing + +To contribute: + +1. Fork the repository +2. Make your changes +3. Run `make all` to verify your changes and test in a Prometheus/Grafana environment. Screenshots are welcome for new panels/dashboards. +4. Submit a pull request + +If you want to make some parameter configurable, use `config.libsonnet` as an +entrypoint. + diff --git a/opentelemetry-collector-mixin/alerts.jsonnet b/opentelemetry-collector-mixin/alerts.jsonnet new file mode 100644 index 000000000..75e7c1b29 --- /dev/null +++ b/opentelemetry-collector-mixin/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/opentelemetry-collector-mixin/alerts/alerts.libsonnet b/opentelemetry-collector-mixin/alerts/alerts.libsonnet new file mode 100644 index 000000000..e0fd5f534 --- /dev/null +++ b/opentelemetry-collector-mixin/alerts/alerts.libsonnet @@ -0,0 +1,25 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'otelcol', + rules: [ + { + alert: 'OtelcolSendingQueueFull', + expr: ||| + otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity + |||, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The sending queue has filled up', + description: 'The sending queue is full; the collector might start dropping data', + }, + }, + ], + }, + ], + }, +} diff --git a/opentelemetry-collector-mixin/config.libsonnet b/opentelemetry-collector-mixin/config.libsonnet new file mode 100644 index 000000000..5fa45b7a0 --- /dev/null +++ b/opentelemetry-collector-mixin/config.libsonnet @@ -0,0 +1,23 @@ +{ + _config+:: { + // Grafana dashboard IDs are necessary for stable links 
for dashboards + grafanaDashboardIDs: { + 'collector.json': std.md5('collector.json'), + }, + + // Config for the Grafana dashboards in the OpenTelemetry Collector mixin + grafana: { + // The default refresh time for all dashboards, defaults to 10s + refresh: '10s', + + // Timezone for Grafana dashboards: UTC, browser, ... + grafanaTimezone: 'UTC', + + // Tags for Grafana dashboards + dashboardTags: ['otelcol'], + }, + + // Default datasource name + datasourceName: 'default', + }, +} diff --git a/opentelemetry-collector-mixin/dashboards.jsonnet b/opentelemetry-collector-mixin/dashboards.jsonnet new file mode 100644 index 000000000..b7d3e1e95 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards.jsonnet @@ -0,0 +1,12 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; +local cfg = import 'config.libsonnet'; + +{ + [name]: dashboards[name] { + uid: std.get(cfg._config.grafanaDashboardIDs, name, default=std.md5(name)), + timezone: cfg._config.grafana.grafanaTimezone, + refresh: cfg._config.grafana.refresh, + tags: cfg._config.grafana.dashboardTags, + } + for name in std.objectFields(dashboards) +} diff --git a/opentelemetry-collector-mixin/dashboards/collector.libsonnet b/opentelemetry-collector-mixin/dashboards/collector.libsonnet new file mode 100644 index 000000000..601eb3957 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/collector.libsonnet @@ -0,0 +1,135 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local row = g.panel.row; +local variables = import './utils/variables.libsonnet'; +local panels = import './utils/panels.libsonnet'; +local queries = import './utils/queries.libsonnet'; + +{ + grafanaDashboards+:: { + 'collector.json': + g.dashboard.new( + 'OpenTelemetry Collector Health', + ) + + g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.') + + g.dashboard.graphTooltip.withSharedCrosshair() +
g.dashboard.withVariables([ + variables.datasourceVariable, + variables.jobVariable, + variables.clusterVariable, + variables.namespaceVariable, + variables.instanceVariable, + ]) + + g.dashboard.withPanels( + g.util.grid.wrapPanels([ + // Overview row + row.new('Overview'), + panels.stat.base('Running Collectors', [queries.runningCollectors]), + panels.table.uptime('Collector uptime', [queries.collectorUptime]), + + // Resources row + row.new('Resources'), + panels.timeSeries.cpuUsage('CPU usage', [queries.cpuUsage]) + + { gridPos: { w: 8 } }, + panels.timeSeries.memoryUsage('Memory (RSS)', queries.memUsageRSS) + + { gridPos: { w: 8 } }, + panels.timeSeries.memoryUsage('Memory (Heap Alloc)', queries.memUsageHeapAlloc) + + { gridPos: { w: 8 } }, + + // Receivers row + row.new('Receivers'), + panels.timeSeries.short('Accepted metric points', [queries.acceptedMetricPoints]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Accepted log records', [queries.acceptedLogRecords]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Accepted spans', [queries.acceptedSpans]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Total incoming items', [queries.incomingItems]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused metric points', [queries.refusedMetricPoints]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused log records', [queries.refusedLogRecords]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused spans', [queries.refusedSpans]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Total outgoing items', [queries.outgoingItems]) + + { gridPos: { w: 6 } }, + + // Processors row + row.new('Processors'), + panels.heatmap.base('Number of units in the batch', [queries.batchSendSize]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Batch cardinality', [queries.batchCardinality]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Queue current size vs capacity', [queries.queueSize, queries.queueCapacity]) + + { 
gridPos: { w: 8 } }, + panels.timeSeries.short('Batch size send trigger', [queries.batchSizeSendTrigger]), + panels.timeSeries.short('Batch timeout send trigger', [queries.batchTimeoutSendTrigger]), + + // Exporters row + row.new('Exporters'), + panels.timeSeries.short('Exported metrics', [queries.exportedMetrics]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Exported logs', [queries.exportedLogs]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Exported spans', [queries.exportedSpans]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Failed metrics', [queries.failedMetrics]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Failed logs', [queries.failedLogs]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Failed spans', [queries.failedSpans]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Enqueue failed metrics', [queries.enqueueFailedMetrics]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Enqueue failed logs', [queries.enqueueFailedLogs]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Enqueue failed spans', [queries.enqueueFailedSpans]) + + { gridPos: { w: 8 } }, + + // Network traffic row + row.new('Network traffic'), + panels.timeSeries.seconds('Inbound gRPC request duration percentiles', [ + queries.grpcInboundDurationP50, + queries.grpcInboundDurationP90, + queries.grpcInboundDurationP99, + ]), + panels.timeSeries.seconds('Inbound HTTP request duration percentiles', [ + queries.httpInboundDurationP50, + queries.httpInboundDurationP90, + queries.httpInboundDurationP99, + ]), + panels.timeSeries.bytes('Inbound gRPC request size percentiles', [ + queries.grpcInboundSizeP50, + queries.grpcInboundSizeP90, + queries.grpcInboundSizeP99, + ]), + panels.timeSeries.bytes('Inbound HTTP request size percentiles', [ + queries.httpInboundSizeP50, + queries.httpInboundSizeP90, + queries.httpInboundSizeP99, + ]), + panels.timeSeries.seconds('Outgoing gRPC request duration percentiles', [ + queries.grpcOutboundDurationP50, 
+ queries.grpcOutboundDurationP90, + queries.grpcOutboundDurationP99, + ]), + panels.timeSeries.seconds('Outgoing HTTP request duration percentiles', [ + queries.httpOutboundDurationP50, + queries.httpOutboundDurationP90, + queries.httpOutboundDurationP99, + ]), + panels.timeSeries.bytes('Outgoing gRPC request size percentiles', [ + queries.grpcOutboundSizeP50, + queries.grpcOutboundSizeP90, + queries.grpcOutboundSizeP99, + ]), + panels.timeSeries.bytes('Outgoing HTTP request size percentiles', [ + queries.httpOutboundSizeP50, + queries.httpOutboundSizeP90, + queries.httpOutboundSizeP99, + ]), + + ], panelWidth=12, panelHeight=8), + ), + }, +} diff --git a/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet b/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 000000000..dba171f18 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1 @@ +(import 'collector.libsonnet') diff --git a/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet new file mode 100644 index 000000000..3958ef48b --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet @@ -0,0 +1,207 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +{ + timeSeries: { + local timeSeries = g.panel.timeSeries, + local fieldOverride = g.panel.timeSeries.fieldOverride, + local custom = timeSeries.fieldConfig.defaults.custom, + local options = timeSeries.options, + + base(title, targets): + timeSeries.new(title) + + timeSeries.queryOptions.withTargets(targets) + + timeSeries.queryOptions.withInterval('1m') + + options.legend.withDisplayMode('table') + + options.legend.withCalcs([ + 'lastNotNull', + 'max', + ]) + + custom.withFillOpacity(10) + + custom.withShowPoints('never') + + timeSeries.panelOptions.withDescription(title), + + short(title, targets): + self.base(title, targets) + + 
timeSeries.standardOptions.withUnit('short') + + timeSeries.standardOptions.withDecimals(0), + + seconds(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('s') + + custom.scaleDistribution.withType('log') + + custom.scaleDistribution.withLog(10), + + cpuUsage: self.seconds, + + bytes(title, targets): + self.base(title, targets,) + + timeSeries.standardOptions.withUnit('bytes') + + custom.scaleDistribution.withType('log') + + custom.scaleDistribution.withLog(2), + + memoryUsage(title, targets): + self.bytes(title, targets) + + timeSeries.standardOptions.withOverrides([ + fieldOverride.byRegexp.new('/(virtual|resident)/i') + + fieldOverride.byRegexp.withProperty( + 'custom.fillOpacity', + 0 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineWidth', + 2 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineStyle', + { + dash: [10, 10], + fill: 'dash', + } + ), + ]), + + durationQuantile(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('s') + + custom.withDrawStyle('bars') + + timeSeries.standardOptions.withOverrides([ + fieldOverride.byRegexp.new('/mean/i') + + fieldOverride.byRegexp.withProperty( + 'custom.fillOpacity', + 0 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineStyle', + { + dash: [8, 10], + fill: 'dash', + } + ), + ]), + + milliseconds(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('ms'), + + cps(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('cps'), + }, + + heatmap: { + local heatmap = g.panel.heatmap, + local options = heatmap.options, + + base(title, targets): + heatmap.new(title) + + heatmap.queryOptions.withTargets(targets) + + heatmap.queryOptions.withInterval('1m') + + + options.withCalculate() + + options.calculation.xBuckets.withMode('size') + + options.calculation.xBuckets.withValue('1min') + + options.withCellGap(2) + + options.color.withMode('scheme') + + 
options.color.withScheme('Spectral') + + options.color.withSteps(128) + + options.yAxis.withDecimals(0) + + options.yAxis.withUnit('short') + + heatmap.panelOptions.withDescription(title), + }, + + stat: { + local stat = g.panel.stat, + local options = stat.options, + + base(title, targets): + stat.new(title) + + stat.queryOptions.withTargets(targets) + + + options.withColorMode('value') + + options.withGraphMode('none') + + options.withJustifyMode('center') + + options.withOrientation('auto') + + options.reduceOptions.withCalcs(['lastNotNull']) + + options.reduceOptions.withFields('') + + options.reduceOptions.withValues(false) + + options.withShowPercentChange(false) + + options.withTextMode('auto') + + options.withWideLayout(true) + + stat.standardOptions.withUnit('none') + + stat.panelOptions.withDescription(title), + }, + + table: { + local table = g.panel.table, + local options = table.options, + + base(title, targets): + table.new(title) + + table.queryOptions.withTargets(targets) + + table.queryOptions.withInterval('1m') + + + options.withCellHeight('sm') + + options.withFrameIndex(0) + + options.withShowHeader(true) + + options.footer.withShow(false) + + options.footer.withCountRows(false) + + options.footer.withFields('') + + options.footer.withReducer(['sum']) + + table.panelOptions.withDescription(title), + + uptime(title, targets): + self.base(title, targets) + + table.standardOptions.withUnit('s') + + table.queryOptions.withTransformations([ + { + id: 'organize', + options: { + excludeByName: { + Time: true, + job: true, + __name__: true, + }, + includeByName: { + cluster: true, + namespace: true, + instance: true, + service_version: true, + version: true, + Value: true, + }, + indexByName: { + cluster: 0, + namespace: 1, + instance: 2, + service_version: 3, + version: 3, + Value: 4, + }, + renameByName: { + Value: 'Uptime', + }, + }, + }, + ]) + + table.standardOptions.withOverrides([ + { + matcher: { + id: 'byName', + options: 'Uptime', + }, + 
properties: [ + { + id: 'custom.displayMode', + value: 'basic', + }, + ], + }, + ]) + + table.options.withSortBy([ + { + displayName: 'Uptime', + desc: true, + }, + ]), + + }, +} diff --git a/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet new file mode 100644 index 000000000..48263a133 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet @@ -0,0 +1,505 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +local prometheusQuery = g.query.prometheus; +local variables = import './variables.libsonnet'; + +{ + // Existing queries (modified to work with instance variable) + cpuUsage: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by (job, cluster, namespace, instance) ( + rate( + { + __name__=~"otelcol_process_cpu_seconds(_total)?", + job=~"$job", + cluster=~"$cluster", + namespace=~"$namespace", + instance=~"$instance" + } + [$__rate_interval]) + ) + ||| + ) + + prometheusQuery.withIntervalFactor(2) + + prometheusQuery.withLegendFormat(||| + {{cluster}} - {{namespace}} - {{instance}} + |||), + + memUsageRSS: + [ + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by (job, cluster, namespace, instance) ( + {__name__=~"otelcol_process_memory_rss(_bytes)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"} + ) + ||| + ) + + prometheusQuery.withIntervalFactor(2) + + prometheusQuery.withLegendFormat(||| + RSS - {{cluster}} - {{namespace}} - {{instance}} + |||), + ], + + memUsageHeapAlloc: + [ + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by (job, cluster, namespace, instance) ( + {__name__=~"otelcol_process_runtime_total_sys_memory_bytes(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"} + ) + ||| + ) + + prometheusQuery.withIntervalFactor(2) + + 
prometheusQuery.withLegendFormat(||| + Heap Alloc - {{cluster}} - {{namespace}} - {{instance}} + |||), + ], + + // Fleet Overview queries + runningCollectors: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + count({__name__=~"otelcol_process_uptime(_seconds_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}) + ||| + ), + + collectorUptime: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + {__name__=~"otelcol_process_uptime(_seconds_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"} + ||| + ) + + prometheusQuery.withFormat('table') + + prometheusQuery.withInstant(true), + + // Receivers status queries + acceptedMetricPoints: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + acceptedLogRecords: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + acceptedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + incomingItems: + prometheusQuery.new( + '$' +
variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_incoming_items(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedMetricPoints: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedLogRecords: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + outgoingItems: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_outgoing_items(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + // Processors status queries + batchSendSize: + 
prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by (job, cluster, namespace, instance, le) (increase({__name__=~"otelcol_processor_batch_batch_send_size_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} - {{le}}'), + + batchCardinality: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_processor_batch_metadata_cardinality(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + queueSize: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_exporter_queue_size(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} queue current size'), + + queueCapacity: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_exporter_queue_capacity(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} queue capacity'), + + batchSizeSendTrigger: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_batch_batch_size_trigger_send(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ), + + batchTimeoutSendTrigger: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster,
namespace, instance) (rate({__name__=~"otelcol_processor_batch_timeout_trigger_send(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + // Exporters status queries + exportedMetrics: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + exportedLogs: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ), + + exportedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + failedMetrics: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_send_failed_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + failedLogs: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) 
(rate({__name__=~"otelcol_exporter_send_failed_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + failedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_send_failed_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedMetrics: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedLogs: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + // Network traffic queries + grpcInboundDurationP50: + prometheusQuery.new( + '$' + 
variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", 
instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, 
sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundDurationP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + 
prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpOutboundDurationP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpOutboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpOutboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, 
namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcOutboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpOutboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpOutboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}}
- {{namespace}} - {{instance}}'), + + httpOutboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), +} diff --git a/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet new file mode 100644 index 000000000..362ccd2de --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet @@ -0,0 +1,51 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local variable = g.dashboard.variable; +local cfg = import '../../config.libsonnet'; + +{ + datasourceVariable: + variable.datasource.new('datasource', 'prometheus') + + variable.datasource.generalOptions.withLabel('Data source') + + variable.datasource.generalOptions.withCurrent(cfg._config.datasourceName) + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue(), + + clusterVariable: + variable.query.new('cluster') + + variable.query.generalOptions.withLabel('Cluster') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('cluster', metric='{__name__=~"otelcol_process_uptime.*"}'), + + namespaceVariable: + variable.query.new('namespace') + + variable.query.generalOptions.withLabel('Namespace') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + 
variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('namespace', metric='{__name__=~"otelcol_process_uptime.*"}'), + + jobVariable: + variable.query.new('job') + + variable.query.generalOptions.withLabel('Job') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('job', metric='{__name__=~"otelcol_process_uptime.*"}'), + + instanceVariable: + variable.query.new('instance') + + variable.query.generalOptions.withLabel('Instance') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('instance', metric='{__name__=~"otelcol_process_uptime.*"}'), +} diff --git a/opentelemetry-collector-mixin/dashboards_out/.lint b/opentelemetry-collector-mixin/dashboards_out/.lint new file mode 100644 index 000000000..bc18d39f7 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards_out/.lint @@ -0,0 +1,7 @@ +exclusions: + template-job-rule: + reason: Allows All value to be '.*' instead of '.+' to accommodate for non-K8S environments + template-instance-rule: + reason: Allows All value to be '.*' instead of '.+' to accommodate for non-K8S environments + panel-datasource-rule: + reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option.
diff --git a/opentelemetry-collector-mixin/dashboards_out/collector-before.json b/opentelemetry-collector-mixin/dashboards_out/collector-before.json new file mode 100644 index 000000000..3a2c41ada --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards_out/collector-before.json @@ -0,0 +1,2096 @@ +{ + "description": "A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.", + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Running Collectors", + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count({__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n" + } + ], + "title": "Running Collectors", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Collector uptime", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Uptime" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "basic" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "interval": "1m", + "options": { + "cellHeight": "sm", + "footer": { + "countRows": 
false, + "fields": [ + "" + ], + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Uptime" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "{__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n", + "format": "table", + "instant": true + } + ], + "title": "Collector uptime", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "job": true + }, + "includeByName": { + "Value": true, + "cluster": true, + "instance": true, + "namespace": true, + "service_version": true, + "version": true + }, + "indexByName": { + "Value": 4, + "cluster": 0, + "instance": 2, + "namespace": 1, + "service_version": 3, + "version": 3 + }, + "renameByName": { + "Value": "Uptime" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 9 + }, + "id": 4, + "panels": [ ], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "CPU usage", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 5, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n rate(\n {\n __name__=~\"otelcol_process_cpu_seconds(_total)?\",\n job=~\"$job\",\n cluster=~\"$cluster\",\n 
namespace=~\"$namespace\",\n instance=~\"$instance\"\n }\n [$__rate_interval])\n)\n", + "intervalFactor": 2, + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "CPU usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Memory (RSS)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/(virtual|resident)/i" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineWidth", + "value": 2 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 6, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_memory_rss(_bytes)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", + "intervalFactor": 2, + "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "Memory (RSS)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Memory (Heap Alloc)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/(virtual|resident)/i" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + 
{ + "id": "custom.lineWidth", + "value": 2 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 7, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_runtime_total_sys_memory_bytes(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", + "intervalFactor": 2, + "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "Memory (Heap Alloc)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 18 + }, + "id": 8, + "panels": [ ], + "title": "Receivers", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted metric points", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 19 + }, + "id": 9, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted metric points", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted log records", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 19 + }, + "id": 10, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted log records", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 19 + }, + "id": 11, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Total incoming items", + 
"fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 19 + }, + "id": 12, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_incoming_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Total incoming items", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused metric points", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 27 + }, + "id": 13, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Refused metric points", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused log records", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + 
"decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 27 + }, + "id": 14, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Refused log records", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 27 + }, + "id": 15, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Refused spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Total outgoing items", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 27 + }, + "id": 16, + "interval": 
"1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_outgoing_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Total outgoing items", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 35 + }, + "id": 17, + "panels": [ ], + "title": "Processors", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Number of units in the batch", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 18, + "interval": "1m", + "options": { + "calculate": true, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "1min" + } + }, + "cellGap": 2, + "color": { + "mode": "scheme", + "scheme": "Spectral", + "steps": 128 + }, + "yAxis": { + "decimals": 0, + "unit": "short" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance, le) (increase({__name__=~\"otelcol_processor_batch_batch_send_size_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} - {{le}}" + } + ], + "title": "Number of units in the batch", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch cardinality", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" 
+ } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 19, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_processor_batch_metadata_cardinality(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Batch cardinality", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Queue current size vs capacity", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 20, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_size(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue current size" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_capacity(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue capacity" + } + ], + "title": "Queue current size vs capacity", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch size send trigger", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 21, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_batch_size_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" + } + ], + "title": "Batch size send trigger", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch timeout send trigger", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 22, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Batch timeout send trigger", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 52 + }, + "id": 23, + "panels": [ ], + "title": "Exporters", +
"type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 53 + }, + "id": 24, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Exported metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 53 + }, + "id": 25, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" + } + ], + "title": "Exported logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + 
"showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 53 + }, + "id": 26, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Exported spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 61 + }, + "id": 27, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Failed metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 61 + }, + "id": 
28, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Failed logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 61 + }, + "id": 29, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Failed spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Enqueue failed metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 69 + }, + "id": 30, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, 
+ "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Enqueue failed logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 69 + }, + "id": 31, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Enqueue failed spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 69 + }, + "id": 32, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed spans", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 77 + }, + "id": 33, + "panels": [ ], + "title": "Network traffic", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound gRPC request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 34, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound gRPC request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound HTTP request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 35, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) 
(rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound HTTP request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound gRPC request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 36, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound gRPC request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound HTTP request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 37, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + 
} + ], + "title": "Inbound HTTP request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing gRPC request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 38, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing gRPC request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing HTTP request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 39, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing HTTP request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing gRPC request size percentiles", + 
"fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 102 + }, + "id": 40, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing gRPC request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing HTTP request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, 
+ "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 102 + }, + "id": 41, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing HTTP request size percentiles", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "otelcol" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, job)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, cluster)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, namespace)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, instance)", + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "UTC", + "title": "OpenTelemetry Collector Health", + "uid": "3219d83e205d394f293831f6334ab368" +} diff --git a/opentelemetry-collector-mixin/dashboards_out/collector.json b/opentelemetry-collector-mixin/dashboards_out/collector.json new file mode 100644 index 000000000..3a2c41ada --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards_out/collector.json @@ -0,0 +1,2096 @@ +{ + "description": "A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.", + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed 
--" + }, + "description": "Running Collectors", + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count({__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n" + } + ], + "title": "Running Collectors", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Collector uptime", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Uptime" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "basic" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "interval": "1m", + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": [ + "" + ], + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Uptime" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "{__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n", + "format": "table", + "instant": true + } + ], + "title": "Collector uptime", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "job": true + 
}, + "includeByName": { + "Value": true, + "cluster": true, + "instance": true, + "namespace": true, + "service_version": true, + "version": true + }, + "indexByName": { + "Value": 4, + "cluster": 0, + "instance": 2, + "namespace": 1, + "service_version": 3, + "version": 3 + }, + "renameByName": { + "Value": "Uptime" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 9 + }, + "id": 4, + "panels": [ ], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "CPU usage", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 5, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n rate(\n {\n __name__=~\"otelcol_process_cpu_seconds(_total)?\",\n job=~\"$job\",\n cluster=~\"$cluster\",\n namespace=~\"$namespace\",\n instance=~\"$instance\"\n }\n [$__rate_interval])\n)\n", + "intervalFactor": 2, + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "CPU usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Memory (RSS)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/(virtual|resident)/i" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": 
"custom.lineWidth", + "value": 2 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 6, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_memory_rss(_bytes)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", + "intervalFactor": 2, + "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "Memory (RSS)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Memory (Heap Alloc)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/(virtual|resident)/i" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineWidth", + "value": 2 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 7, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_runtime_total_sys_memory_bytes(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}\n)\n", + "intervalFactor": 2, + "legendFormat": "Heap Alloc - {{cluster}} - {{namespace}} - {{instance}}\n" + } + ], + "title": "Memory (Heap Alloc)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 18 + }, + "id": 8, + "panels": [ ], + "title": "Receivers", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted metric points", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 19 + }, + "id": 9, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted metric points", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted log records", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 19 + }, + "id": 10, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance)
(rate({__name__=~\"otelcol_receiver_accepted_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted log records", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Accepted spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 19 + }, + "id": 11, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Accepted spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Total incoming items", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 19 + }, + "id": 12, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_incoming_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", 
namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Total incoming items", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused metric points", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 27 + }, + "id": 13, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Refused metric points", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused log records", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 27 + }, + "id": 14, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} 
- {{namespace}} - {{instance}}" + } + ], + "title": "Refused log records", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Refused spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 27 + }, + "id": 15, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Refused spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Total outgoing items", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 27 + }, + "id": 16, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_outgoing_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Total outgoing items", + "type": "timeseries" + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 35 + }, + "id": 17, + "panels": [ ], + "title": "Processors", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Number of units in the batch", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 18, + "interval": "1m", + "options": { + "calculate": true, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "1min" + } + }, + "cellGap": 2, + "color": { + "mode": "scheme", + "scheme": "Spectral", + "steps": 128 + }, + "yAxis": { + "decimals": 0, + "unit": "short" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (job, cluster, namespace, instance, le) (increase({__name__=~\"otelcol_processor_batch_batch_send_size_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} - {{le}}" + } + ], + "title": "Number of units in the batch", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch cardinality", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 19, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_processor_batch_metadata_cardinality(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + 
"title": "Batch cardinality", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Queue current size vs capacity", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 20, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_size(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue current size" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_capacity(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue capacity" + } + ], + "title": "Queue current size vs capacity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch size send trigger", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 21, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, 
instance) (rate({__name__=~\"otelcol_processor_batch_batch_size_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" + } + ], + "title": "Batch size send trigger", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Batch timeout send trigger", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 22, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Batch timeout send trigger", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 52 + }, + "id": 23, + "panels": [ ], + "title": "Exporters", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 53 + }, + "id": 24, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job,
cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Exported metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 53 + }, + "id": 25, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" + } + ], + "title": "Exported logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Exported spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 53 + }, + "id": 26, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Exported spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 61 + }, + "id": 27, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Failed metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 61 + }, + "id": 28, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + 
"title": "Failed logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Failed spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 61 + }, + "id": 29, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Failed spans", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Enqueue failed metrics", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 69 + }, + "id": 30, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- 
Mixed --" + }, + "description": "Enqueue failed logs", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 69 + }, + "id": 31, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed logs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Enqueue failed spans", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "decimals": 0, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 69 + }, + "id": 32, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Enqueue failed spans", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 77 + }, + "id": 33, + "panels": [ ], + "title": "Network traffic", + "type": "row" 
+ }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound gRPC request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 34, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound gRPC request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound HTTP request duration
percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 35, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound HTTP request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound gRPC request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 
2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 36, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound gRPC request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Inbound HTTP request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + 
}, + "id": 37, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Inbound HTTP request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing gRPC request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 38, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + 
"displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing gRPC request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing HTTP request duration percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 39, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", 
+ "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing HTTP request duration percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing gRPC request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 102 + }, + "id": 40, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) 
(rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing gRPC request size percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Outgoing HTTP request size percentiles", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 102 + }, + "id": 41, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", 
namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", + "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" + } + ], + "title": "Outgoing HTTP request size percentiles", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "otelcol" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, job)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, cluster)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, namespace)", + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, instance)", + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "UTC", + "title": "OpenTelemetry Collector Health", + "uid": "3219d83e205d394f293831f6334ab368" +} diff --git a/opentelemetry-collector-mixin/jsonnetfile.json b/opentelemetry-collector-mixin/jsonnetfile.json new file mode 100644 index 000000000..2414c8671 --- /dev/null +++ b/opentelemetry-collector-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": "main" + } + ], + "legacyImports": true +} diff --git a/opentelemetry-collector-mixin/mixin.libsonnet b/opentelemetry-collector-mixin/mixin.libsonnet new file mode 100644 index 000000000..152721db9 --- /dev/null +++ b/opentelemetry-collector-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') + +(import 'config.libsonnet') diff --git a/opentelemetry-collector-mixin/prometheus_alerts.yaml b/opentelemetry-collector-mixin/prometheus_alerts.yaml new file mode 100644 index 000000000..b4cab2282 --- /dev/null +++ b/opentelemetry-collector-mixin/prometheus_alerts.yaml @@ -0,0 +1,12 @@ +"groups": +- "name": "otelcol" + "rules": + - "alert": "OtelcolSendingQueueFull" + "annotations": + "description": 
"The sending queue is full; the collector might start dropping data" + "summary": "The sending queue has filled up" + "expr": | + otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity + "for": "30m" + "labels": + "severity": "warning" diff --git a/opentelemetry-collector-mixin/prometheus_rules.yaml b/opentelemetry-collector-mixin/prometheus_rules.yaml new file mode 100644 index 000000000..bfeea2506 --- /dev/null +++ b/opentelemetry-collector-mixin/prometheus_rules.yaml @@ -0,0 +1,3 @@ +"groups": +- "name": "otelcol-rules" + "rules": [] diff --git a/opentelemetry-collector-mixin/rules.jsonnet b/opentelemetry-collector-mixin/rules.jsonnet new file mode 100644 index 000000000..dbe13f417 --- /dev/null +++ b/opentelemetry-collector-mixin/rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) diff --git a/opentelemetry-collector-mixin/rules/rules.libsonnet b/opentelemetry-collector-mixin/rules/rules.libsonnet new file mode 100644 index 000000000..137f46c58 --- /dev/null +++ b/opentelemetry-collector-mixin/rules/rules.libsonnet @@ -0,0 +1,11 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'otelcol-rules', + rules: [ + ], + }, + ], + }, +} From 785da338d744a817d09d2578a0bb9edfe2795419 Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 13:12:55 +0300 Subject: [PATCH 2/8] Remove separate license Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/LICENSE | 201 -------------------------- 1 file changed, 201 deletions(-) delete mode 100644 opentelemetry-collector-mixin/LICENSE diff --git a/opentelemetry-collector-mixin/LICENSE b/opentelemetry-collector-mixin/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/opentelemetry-collector-mixin/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
From c48784ddaff44bdd1faac38393610305e52388bc Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 13:25:47 +0300 Subject: [PATCH 3/8] Use parent Makefile instead Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/Makefile | 55 +------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/opentelemetry-collector-mixin/Makefile b/opentelemetry-collector-mixin/Makefile index efe0be7da..b4fdca560 100644 --- a/opentelemetry-collector-mixin/Makefile +++ b/opentelemetry-collector-mixin/Makefile @@ -1,54 +1 @@ -JSONNET_VENDOR=vendor -DASHBOARDS_OUT_DIR ?=dashboards_out -JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s - -.PHONY: all -all: fmt generate lint - -.PHONY: generate -generate: prometheus_alerts.yaml prometheus_rules.yaml $(DASHBOARDS_OUT_DIR) - -.PHONY: fmt -fmt: - find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ - xargs -n 1 -- jsonnetfmt -i - -.PHONY: lint -lint: jsonnet-lint alerts-lint dashboards-lint - -.PHONY: jsonnet-lint -jsonnet-lint: - @find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ - xargs -n 1 -- jsonnet-lint -J vendor - -.PHONY: alerts-lint -alerts-lint: prometheus_alerts.yaml prometheus_rules.yaml - promtool check rules prometheus_rules.yaml - promtool check rules prometheus_alerts.yaml - -.PHONY: dashboards-lint -dashboards-lint: - @find $(DASHBOARDS_OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 dashboard-linter lint --strict - -.PHONY: prometheus_alerts.yaml -prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts/*.libsonnet - jsonnet -S alerts.jsonnet >$@ - -.PHONY: prometheus_rules.yaml -prometheus_rules.yaml: mixin.libsonnet config.libsonnet rules/*.libsonnet - jsonnet -S rules.jsonnet >$@ - - -.PHONY: dashboards_out -dashboards_out: mixin.libsonnet config.libsonnet dashboards/*.libsonnet - @mkdir -p dashboards_out - cp .lint dashboards_out/.lint - jsonnet -J vendor -m dashboards_out dashboards.jsonnet - -.PHONY: clean -clean: - rm -rf prometheus_alerts.yaml prometheus_rules.yaml dashboards_out - -.PHONY: jb_install -jb_install: - jb install +include ../Makefile_mixin From a893f1c9623f36ed590c09ccab17854fa362336e Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 14:05:05 +0300 Subject: [PATCH 4/8] Remove unnecessary files Signed-off-by: Paschalis Tsilias --- .../dashboards_out/.lint | 7 - .../dashboards_out/collector-before.json | 2096 ----------------- .../dashboards_out/collector.json | 2096 ----------------- 3 files changed, 4199 deletions(-) delete mode 100644 opentelemetry-collector-mixin/dashboards_out/.lint delete mode 100644 opentelemetry-collector-mixin/dashboards_out/collector-before.json delete mode 100644 opentelemetry-collector-mixin/dashboards_out/collector.json diff --git a/opentelemetry-collector-mixin/dashboards_out/.lint b/opentelemetry-collector-mixin/dashboards_out/.lint deleted file mode 100644 index bc18d39f7..000000000 --- a/opentelemetry-collector-mixin/dashboards_out/.lint +++ /dev/null @@ -1,7 
+0,0 @@ -exclusions: - template-job-rule: - reason: Allows All value to be '.*' instead of '.*' to accommodate for non-K8S environments - template-instance-rule: - reason: Allows All value to be '.*' instead of '.*' to accommodate for non-K8S environments - panel-datasource-rule: - reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option. diff --git a/opentelemetry-collector-mixin/dashboards_out/collector-before.json b/opentelemetry-collector-mixin/dashboards_out/collector-before.json deleted file mode 100644 index 3a2c41ada..000000000 --- a/opentelemetry-collector-mixin/dashboards_out/collector-before.json +++ /dev/null @@ -1,2096 +0,0 @@ -{ - "description": "A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.", - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 0, - "y": 0 - }, - "id": 1, - "panels": [ ], - "title": "Overview", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Running Collectors", - "fieldConfig": { - "defaults": { - "unit": "none" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "count({__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n" - } - ], - "title": "Running Collectors", - "type": "stat" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" 
- }, - "description": "Collector uptime", - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Uptime" - }, - "properties": [ - { - "id": "custom.displayMode", - "value": "basic" - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 3, - "interval": "1m", - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": [ - "" - ], - "reducer": [ - "sum" - ], - "show": false - }, - "frameIndex": 0, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "Uptime" - } - ] - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "{__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n", - "format": "table", - "instant": true - } - ], - "title": "Collector uptime", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "__name__": true, - "job": true - }, - "includeByName": { - "Value": true, - "cluster": true, - "instance": true, - "namespace": true, - "service_version": true, - "version": true - }, - "indexByName": { - "Value": 4, - "cluster": 0, - "instance": 2, - "namespace": 1, - "service_version": 3, - "version": 3 - }, - "renameByName": { - "Value": "Uptime" - } - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 9 - }, - "id": 4, - "panels": [ ], - "title": "Resources", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "CPU usage", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 5, - 
"interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n rate(\n {\n __name__=~\"otelcol_process_cpu_seconds(_total)?\",\n job=~\"$job\",\n cluster=~\"$cluster\",\n namespace=~\"$namespace\",\n instance=~\"$instance\"\n }\n [$__rate_interval])\n)\n", - "intervalFactor": 2, - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}\n" - } - ], - "title": "CPU usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Memory (RSS)", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/(virtual|resident)/i" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.lineWidth", - "value": 2 - }, - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 10 - ], - "fill": "dash" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 6, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_memory_rss(_bytes)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", - "intervalFactor": 2, - "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" - } - ], - "title": "Memory (RSS)", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": 
"-- Mixed --" - }, - "description": "Memory (Heap Alloc)", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/(virtual|resident)/i" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.lineWidth", - "value": 2 - }, - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 10 - ], - "fill": "dash" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 7, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_runtime_total_sys_memory_bytes(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", - "intervalFactor": 2, - "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" - } - ], - "title": "Memory (Heap Alloc)", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 18 - }, - "id": 8, - "panels": [ ], - "title": "Receivers", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted metric points", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 0, - "y": 19 - }, - "id": 9, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": 
"$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted metric points", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted log records", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 19 - }, - "id": 10, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted log records", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 19 - }, - "id": 11, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) 
(rate({__name__=~\"otelcol_receiver_accepted_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Total incoming items", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 19 - }, - "id": 12, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_incoming_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Total incoming items", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused metric points", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 0, - "y": 27 - }, - "id": 13, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", 
namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Refused metric points", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused log records", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 27 - }, - "id": 14, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Refused log records", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 27 - }, - "id": 15, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} 
- {{instance}}" - } - ], - "title": "Refused spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Total outgoing items", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 27 - }, - "id": 16, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_outgoing_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Total outgoing items", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 35 - }, - "id": 17, - "panels": [ ], - "title": "Processors", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Number of units in the batch", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 36 - }, - "id": 18, - "interval": "1m", - "options": { - "calculate": true, - "calculation": { - "xBuckets": { - "mode": "size", - "value": "1min" - } - }, - "cellGap": 2, - "color": { - "mode": "scheme", - "scheme": "Spectral", - "steps": 128 - }, - "yAxis": { - "decimals": 0, - "unit": "short" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance, le) (increase({__name__=~\"otelcol_processor_batch_batch_send_size_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} - {{le}}" - } - ], - "title": "Number of units in the batch", - "type": "heatmap" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch cardinality", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 36 - }, - "id": 19, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_processor_batch_metadata_cardinality(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Batch cardinality", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Queue current size vs capacity", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 36 - }, - "id": 20, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_size(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue current size" - }, - { 
- "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_capacity(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue capacity" - } - ], - "title": "Queue current size vs capacity", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch size send trigger", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 21, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" - } - ], - "title": "Batch size send trigger", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch timeout send trigger", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 22, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) 
(rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Batch timeout send trigger", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 52 - }, - "id": 23, - "panels": [ ], - "title": "Exporters", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 53 - }, - "id": 24, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Exported metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 53 - }, - "id": 25, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - 
"expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" - } - ], - "title": "Exported logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 53 - }, - "id": 26, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Exported spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 61 - }, - "id": 27, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Failed metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 61 - }, - "id": 28, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Failed logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 61 - }, - "id": 29, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": 
"Failed spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Enqueue failed metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 69 - }, - "id": 30, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Enqueue failed logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 69 - }, - "id": 31, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - 
"uid": "-- Mixed --" - }, - "description": "Enqueue failed spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 69 - }, - "id": 32, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed spans", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 77 - }, - "id": 33, - "panels": [ ], - "title": "Network traffic", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound gRPC request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 78 - }, - "id": 34, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - 
{{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound gRPC request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound HTTP request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 78 - }, - "id": 35, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": 
"histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound HTTP request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound gRPC request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 86 - }, - "id": 36, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) 
(rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound gRPC request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound HTTP request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 86 - }, - "id": 37, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound HTTP request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing gRPC request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 94 - }, - "id": 38, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - 
{ - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing gRPC request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing HTTP request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 94 - }, - "id": 39, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by 
(job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing HTTP request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing gRPC request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 102 - }, - "id": 40, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", 
cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing gRPC request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing HTTP request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 102 - }, - "id": 41, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": 
"p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing HTTP request size percentiles", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 39, - "tags": [ - "otelcol" - ], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, - "hide": 0, - "label": "Data source", - "name": "datasource", - "query": "prometheus", - "type": "datasource" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Job", - "multi": true, - "name": "job", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, job)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, cluster)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Namespace", - "multi": true, - "name": "namespace", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, namespace)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Instance", - "multi": true, - "name": "instance", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, instance)", - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timezone": "UTC", - "title": "OpenTelemetry Collector Health", - "uid": "3219d83e205d394f293831f6334ab368" -} diff --git a/opentelemetry-collector-mixin/dashboards_out/collector.json b/opentelemetry-collector-mixin/dashboards_out/collector.json deleted file 
mode 100644 index 3a2c41ada..000000000 --- a/opentelemetry-collector-mixin/dashboards_out/collector.json +++ /dev/null @@ -1,2096 +0,0 @@ -{ - "description": "A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.", - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 0, - "y": 0 - }, - "id": 1, - "panels": [ ], - "title": "Overview", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Running Collectors", - "fieldConfig": { - "defaults": { - "unit": "none" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "count({__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n" - } - ], - "title": "Running Collectors", - "type": "stat" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Collector uptime", - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Uptime" - }, - "properties": [ - { - "id": "custom.displayMode", - "value": "basic" - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 3, - "interval": "1m", - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": [ - "" - ], - "reducer": [ - "sum" - ], - "show": false - }, - "frameIndex": 0, - "showHeader": true, - "sortBy": [ - { - "desc": true, - 
"displayName": "Uptime" - } - ] - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "{__name__=~\"otelcol_process_uptime(_seconds_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n", - "format": "table", - "instant": true - } - ], - "title": "Collector uptime", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "__name__": true, - "job": true - }, - "includeByName": { - "Value": true, - "cluster": true, - "instance": true, - "namespace": true, - "service_version": true, - "version": true - }, - "indexByName": { - "Value": 4, - "cluster": 0, - "instance": 2, - "namespace": 1, - "service_version": 3, - "version": 3 - }, - "renameByName": { - "Value": "Uptime" - } - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 9 - }, - "id": 4, - "panels": [ ], - "title": "Resources", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "CPU usage", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 5, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n rate(\n {\n __name__=~\"otelcol_process_cpu_seconds(_total)?\",\n job=~\"$job\",\n cluster=~\"$cluster\",\n namespace=~\"$namespace\",\n instance=~\"$instance\"\n }\n [$__rate_interval])\n)\n", - "intervalFactor": 2, - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}\n" - } - ], 
- "title": "CPU usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Memory (RSS)", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/(virtual|resident)/i" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.lineWidth", - "value": 2 - }, - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 10 - ], - "fill": "dash" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 6, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_memory_rss(_bytes)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", - "intervalFactor": 2, - "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" - } - ], - "title": "Memory (RSS)", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Memory (Heap Alloc)", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/(virtual|resident)/i" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.lineWidth", - "value": 2 - }, - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 10 - ], - "fill": "dash" - } - } - ] - } - ] - }, - "gridPos": { - 
"h": 8, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 7, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance) (\n {__name__=~\"otelcol_process_runtime_total_sys_memory_bytes(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}\n)\n", - "intervalFactor": 2, - "legendFormat": "RSS - {{cluster}} - {{namespace}} - {{instance}}\n" - } - ], - "title": "Memory (Heap Alloc)", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 18 - }, - "id": 8, - "panels": [ ], - "title": "Receivers", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted metric points", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 0, - "y": 19 - }, - "id": 9, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted metric points", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted log records", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - 
"showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 19 - }, - "id": 10, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted log records", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Accepted spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 19 - }, - "id": 11, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_accepted_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Accepted spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Total incoming items", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 
19 - }, - "id": 12, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_incoming_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Total incoming items", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused metric points", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 0, - "y": 27 - }, - "id": 13, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Refused metric points", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused log records", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 27 - }, - "id": 14, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - 
], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Refused log records", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Refused spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 27 - }, - "id": 15, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_receiver_refused_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Refused spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Total outgoing items", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 27 - }, - "id": 16, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", 
- "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_outgoing_items(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Total outgoing items", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 35 - }, - "id": 17, - "panels": [ ], - "title": "Processors", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Number of units in the batch", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 36 - }, - "id": 18, - "interval": "1m", - "options": { - "calculate": true, - "calculation": { - "xBuckets": { - "mode": "size", - "value": "1min" - } - }, - "cellGap": 2, - "color": { - "mode": "scheme", - "scheme": "Spectral", - "steps": 128 - }, - "yAxis": { - "decimals": 0, - "unit": "short" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by (job, cluster, namespace, instance, le) (increase({__name__=~\"otelcol_processor_batch_batch_send_size_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} - {{le}}" - } - ], - "title": "Number of units in the batch", - "type": "heatmap" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch cardinality", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 36 - }, - "id": 19, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - 
}, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_processor_batch_metadata_cardinality(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Batch cardinality", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Queue current size vs capacity", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 36 - }, - "id": 20, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_size(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue current size" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) ({__name__=~\"otelcol_exporter_queue_capacity(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"})\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}} queue capacity" - } - ], - "title": "Queue current size vs capacity", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch size send trigger", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" 
- }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 21, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" - } - ], - "title": "Batch size send trigger", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Batch timeout send trigger", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 22, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_processor_batch_timeout_trigger_send(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Batch timeout send trigger", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 52 - }, - "id": 23, - "panels": [ ], - "title": "Exporters", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - 
"showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 53 - }, - "id": 24, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Exported metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 53 - }, - "id": 25, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n" - } - ], - "title": "Exported logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Exported spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 53 - }, - "id": 26, - "interval": "1m", - "options": { - "legend": { - 
"calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_sent_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Exported spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 61 - }, - "id": 27, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Failed metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 61 - }, - "id": 28, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": 
{ - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Failed logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Failed spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 61 - }, - "id": 29, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_send_failed_spans(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Failed spans", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Enqueue failed metrics", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 69 - }, - "id": 30, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) 
(rate({__name__=~\"otelcol_exporter_enqueue_failed_metric_points(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed metrics", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Enqueue failed logs", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 69 - }, - "id": 31, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_log_records(_total)?\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed logs", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Enqueue failed spans", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "showPoints": "never" - }, - "decimals": 0, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 69 - }, - "id": 32, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "sum by(job, cluster, namespace, instance) (rate({__name__=~\"otelcol_exporter_enqueue_failed_spans(_total)?\", job=~\"$job\", 
cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))\n", - "legendFormat": "{{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Enqueue failed spans", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 0, - "x": 24, - "y": 77 - }, - "id": 33, - "panels": [ ], - "title": "Network traffic", - "type": "row" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound gRPC request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 78 - }, - "id": 34, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_duration_milliseconds_bucket\", 
job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound gRPC request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound HTTP request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 78 - }, - "id": 35, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound HTTP request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound gRPC request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 86 - }, - "id": 36, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_server_request_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - 
"title": "Inbound gRPC request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Inbound HTTP request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 86 - }, - "id": 37, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_server_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Inbound HTTP request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": 
"datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing gRPC request duration percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 94 - }, - "id": 38, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_duration_milliseconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing gRPC request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing HTTP request duration percentiles", - "fieldConfig": { 
- "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 94 - }, - "id": 39, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_duration_seconds_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing HTTP request duration percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing gRPC request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - 
"showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 102 - }, - "id": 40, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"rpc_client_request_size(_bytes_?)_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing gRPC request size percentiles", - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "description": "Outgoing HTTP request size percentiles", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 10, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "never" - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 102 - }, - "id": 
41, - "interval": "1m", - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table" - } - }, - "pluginVersion": "v11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p50 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p90 - {{cluster}} - {{namespace}} - {{instance}}" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$datasource" - }, - "expr": "histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~\"http_client_request_body_size_bytes_bucket\", job=~\"$job\", cluster=~\"$cluster\", namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))\n", - "legendFormat": "p99 - {{cluster}} - {{namespace}} - {{instance}}" - } - ], - "title": "Outgoing HTTP request size percentiles", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 39, - "tags": [ - "otelcol" - ], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, - "hide": 0, - "label": "Data source", - "name": "datasource", - "query": "prometheus", - "type": "datasource" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Job", - "multi": true, - "name": "job", - "query": 
"label_values({__name__=~\"otelcol_process_uptime.*\"}, job)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, cluster)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Namespace", - "multi": true, - "name": "namespace", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, namespace)", - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "includeAll": true, - "label": "Instance", - "multi": true, - "name": "instance", - "query": "label_values({__name__=~\"otelcol_process_uptime.*\"}, instance)", - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timezone": "UTC", - "title": "OpenTelemetry Collector Health", - "uid": "3219d83e205d394f293831f6334ab368" -} From 50135dcc0a09f7f17c2d4b4f3083d645e2e321d1 Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 14:09:09 +0300 Subject: [PATCH 5/8] Remove committed alerts and rules Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/prometheus_alerts.yaml | 12 ------------ opentelemetry-collector-mixin/prometheus_rules.yaml | 3 --- 2 files changed, 15 deletions(-) delete mode 100644 opentelemetry-collector-mixin/prometheus_alerts.yaml delete mode 100644 opentelemetry-collector-mixin/prometheus_rules.yaml diff --git a/opentelemetry-collector-mixin/prometheus_alerts.yaml b/opentelemetry-collector-mixin/prometheus_alerts.yaml deleted file mode 100644 index b4cab2282..000000000 --- a/opentelemetry-collector-mixin/prometheus_alerts.yaml +++ /dev/null 
@@ -1,12 +0,0 @@ -"groups": -- "name": "otelcol" - "rules": - - "alert": "OtelcolSendingQueueFull" - "annotations": - "description": "The sending queue is full; the collector might start dropping data" - "summary": "The sending queue has filled up" - "expr": | - otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity - "for": "30m" - "labels": - "severity": "warning" diff --git a/opentelemetry-collector-mixin/prometheus_rules.yaml b/opentelemetry-collector-mixin/prometheus_rules.yaml deleted file mode 100644 index bfeea2506..000000000 --- a/opentelemetry-collector-mixin/prometheus_rules.yaml +++ /dev/null @@ -1,3 +0,0 @@ -"groups": -- "name": "otelcol-rules" - "rules": [] From 430bb917394a7b114b812c85754702935ab461a1 Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 14:12:57 +0300 Subject: [PATCH 6/8] Add .lint file Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/.lint | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 opentelemetry-collector-mixin/.lint diff --git a/opentelemetry-collector-mixin/.lint b/opentelemetry-collector-mixin/.lint new file mode 100644 index 000000000..bc18d39f7 --- /dev/null +++ b/opentelemetry-collector-mixin/.lint @@ -0,0 +1,7 @@ +exclusions: + template-job-rule: + reason: Allows All value to be '.*' instead of '.+' to accommodate for non-K8S environments + template-instance-rule: + reason: Allows All value to be '.*' instead of '.+' to accommodate for non-K8S environments + panel-datasource-rule: + reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option.
From fe6098c46c13bff44a7a1bcfaeab5bde7ad40e28 Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 14:18:21 +0300 Subject: [PATCH 7/8] Fix alerts Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opentelemetry-collector-mixin/alerts/alerts.libsonnet b/opentelemetry-collector-mixin/alerts/alerts.libsonnet index e0fd5f534..90a897be2 100644 --- a/opentelemetry-collector-mixin/alerts/alerts.libsonnet +++ b/opentelemetry-collector-mixin/alerts/alerts.libsonnet @@ -14,8 +14,8 @@ severity: 'warning', }, annotations: { - summary: 'The sending queue has filled up', - description: 'The sending queue is full; the collector might start dropping data', + summary: 'The sending queue has filled up.', + description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data', }, }, ], From 3f1a169bf9c4f8c03d62c874f1147333fece9b73 Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Thu, 17 Jul 2025 14:31:18 +0300 Subject: [PATCH 8/8] Add UID in another way Signed-off-by: Paschalis Tsilias --- opentelemetry-collector-mixin/dashboards.jsonnet | 1 - opentelemetry-collector-mixin/dashboards/collector.libsonnet | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/opentelemetry-collector-mixin/dashboards.jsonnet b/opentelemetry-collector-mixin/dashboards.jsonnet index b7d3e1e95..c0d94ffa6 100644 --- a/opentelemetry-collector-mixin/dashboards.jsonnet +++ b/opentelemetry-collector-mixin/dashboards.jsonnet @@ -3,7 +3,6 @@ local cfg = import 'config.libsonnet'; { [name]: dashboards[name] { - uid: std.get(cfg._config.grafanaDashboardIDs, name, default=std.md5(name)), timezone: cfg._config.grafana.grafanaTimezone, refresh: cfg._config.grafana.refresh, tags: cfg._config.grafana.dashboardTags, diff --git a/opentelemetry-collector-mixin/dashboards/collector.libsonnet 
b/opentelemetry-collector-mixin/dashboards/collector.libsonnet index 601eb3957..14ec1dfc0 100644 --- a/opentelemetry-collector-mixin/dashboards/collector.libsonnet +++ b/opentelemetry-collector-mixin/dashboards/collector.libsonnet @@ -3,6 +3,7 @@ local row = g.panel.row; local variables = import './utils/variables.libsonnet'; local panels = import './utils/panels.libsonnet'; local queries = import './utils/queries.libsonnet'; +local cfg = import '../config.libsonnet'; { grafanaDashboards+:: { @@ -19,6 +20,7 @@ local queries = import './utils/queries.libsonnet'; variables.namespaceVariable, variables.instanceVariable, ]) + + g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json']) + g.dashboard.withPanels( g.util.grid.wrapPanels([ // Overview row