
Introduce OTel Collector mixin #1465

Status: Open. Wants to merge 8 commits into base: master.
7 changes: 7 additions & 0 deletions opentelemetry-collector-mixin/.lint
@@ -0,0 +1,7 @@
exclusions:
  template-job-rule:
    reason: Allows the All value to be '.*' to accommodate non-K8s environments
  template-instance-rule:
    reason: Allows the All value to be '.*' to accommodate non-K8s environments
  panel-datasource-rule:
    reason: The new Grafonnet promotes configuring datasources at the query level. This should probably end up in the linter as a valid option.
1 change: 1 addition & 0 deletions opentelemetry-collector-mixin/Makefile
@@ -0,0 +1 @@
include ../Makefile_mixin
115 changes: 115 additions & 0 deletions opentelemetry-collector-mixin/README.md
@@ -0,0 +1,115 @@
# opentelemetry-collector-mixin

Prometheus Monitoring Mixin for the OpenTelemetry Collector

This mixin contains a set of Prometheus alert rules and Grafana dashboards
based on the metrics exported by the OpenTelemetry Collector's [internal
telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/).

To use it, you need to have `jsonnet` (any sufficiently modern version should
do, but ideally v0.20+) and `jb` installed.

If you have a working Go development environment, you can run the following to
get started:
```
go install github.com/google/go-jsonnet/cmd/jsonnet@latest
go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
```

### Usage

First, install the dependencies by running the following command from the repo
root:
```
$ jb install
```

You can then build the Prometheus alert and recording rules by running:
```
$ make prometheus_alerts.yaml
$ make prometheus_rules.yaml
```

You can also render a JSON dashboard file for Grafana by running the following
command. The results are stored in the `dashboards_out/` directory.
```
$ make dashboards_out
```

### OpenTelemetry Collector configuration

By default, the OpenTelemetry Collector exposes its [internal
telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/) as
Prometheus metrics on port 8888.

The following configuration can be used as a starting point for scraping these
metrics and sending them to a Prometheus-compatible store.

```yaml
extensions:
  basicauth/remote_write:
    client_auth:
      username: "username"
      password: "password"

receivers:
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 15s
          static_configs:
            - targets: ['0.0.0.0:8888']

processors:
  batch:

exporters:
  prometheusremotewrite:
    endpoint: "http://prometheus/api/prom/push"
    auth:
      authenticator: basicauth/remote_write
    resource_to_telemetry_conversion:
      enabled: true # Convert resource attributes to metric labels

service:
  telemetry:
    metrics:
      level: "detailed"
      readers:
        - pull:
            exporter:
              prometheus:
                host: '0.0.0.0'
                port: 8888
  extensions: [basicauth/remote_write]
  pipelines:
    metrics:
      receivers: [prometheus]
      processors: [batch]
      exporters: [prometheusremotewrite]
```
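If you scrape the collector's self-telemetry endpoint with a plain Prometheus server instead of routing the metrics through the collector itself, the equivalent scrape job is a sketch like the following (the target address is hypothetical and depends on where your collector runs):

```yaml
# Hypothetical Prometheus scrape job for the collector's
# internal-telemetry endpoint; adjust the target to your deployment.
scrape_configs:
  - job_name: 'otel-collector'
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:8888']
```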

### Other requirements

The Makefile contains commands for formatting, linting, and testing the mixin.
For development purposes, you may need one or more of the following tools as well.
```
go install github.com/google/go-jsonnet/cmd/jsonnet-lint@latest
go install github.com/grafana/dashboard-linter@latest
go install github.com/prometheus/prometheus/cmd/promtool@latest
go install github.com/monitoring-mixins/mixtool/cmd/mixtool@main
```
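As an illustration, a typical pre-PR check with these tools could look like the following (command names as installed above; exact flags may differ between versions):

```
$ jsonnet-lint mixin.libsonnet
$ promtool check rules prometheus_alerts.yaml
$ mixtool lint mixin.libsonnet
```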

### Contributing

To contribute:

1. Fork the repository
2. Make your changes
3. Run `make all` to verify your changes and test in a Prometheus/Grafana environment. Screenshots are welcome for new panels/dashboards.
4. Submit a pull request

If you want to make some parameter configurable, use `config.libsonnet` as an
entrypoint.
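For example, a downstream consumer could override defaults with a small wrapper file, following the `_config` convention used by this mixin (the file name and override values here are illustrative):

```
// example.jsonnet (hypothetical): render the mixin with overrides
(import 'mixin.libsonnet') + {
  _config+:: {
    grafana+: {
      refresh: '30s',
      dashboardTags: ['otelcol', 'observability'],
    },
  },
}
```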

1 change: 1 addition & 0 deletions opentelemetry-collector-mixin/alerts.jsonnet
@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
25 changes: 25 additions & 0 deletions opentelemetry-collector-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,25 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'otelcol',
        rules: [
          {
            alert: 'OtelcolSendingQueueFull',
            expr: |||
              otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity
            |||,
            'for': '30m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'The sending queue has filled up.',
              description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data.',
            },
          },
        ],
      },
    ],
  },
}
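Because `prometheusAlerts+::` is a hidden, mergeable field, additional rule groups can be layered on without editing this file. A minimal sketch (the alert name and expression below are illustrative, not part of the mixin):

```
// Hypothetical overlay adding a rule group on top of the mixin.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [{
      name: 'otelcol-extra',
      rules: [{
        alert: 'OtelcolAbsent',
        expr: 'absent(otelcol_process_uptime)',  // assumed metric name
        'for': '10m',
        labels: { severity: 'warning' },
      }],
    }],
  },
}
```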
23 changes: 23 additions & 0 deletions opentelemetry-collector-mixin/config.libsonnet
@@ -0,0 +1,23 @@
{
  _config+:: {
    // Grafana dashboard IDs are necessary for stable links for dashboards
    grafanaDashboardIDs: {
      'collector.json': std.md5('collector.json'),
    },

    // Config for the Grafana dashboards in the OpenTelemetry Collector mixin
    grafana: {
      // The default refresh time for all dashboards; defaults to 10s
      refresh: '10s',

      // Timezone for Grafana dashboards: UTC, browser, ...
      grafanaTimezone: 'UTC',

      // Tags for Grafana dashboards
      dashboardTags: ['otelcol'],
    },

    // Default datasource name
    datasourceName: 'default',
  },
}
11 changes: 11 additions & 0 deletions opentelemetry-collector-mixin/dashboards.jsonnet
@@ -0,0 +1,11 @@
local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
local cfg = import 'config.libsonnet';

{
  [name]: dashboards[name] {
    timezone: cfg._config.grafana.grafanaTimezone,
    refresh: cfg._config.grafana.refresh,
    tags: cfg._config.grafana.dashboardTags,
  }
  for name in std.objectFields(dashboards)
}
137 changes: 137 additions & 0 deletions opentelemetry-collector-mixin/dashboards/collector.libsonnet
@@ -0,0 +1,137 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local row = g.panel.row;
local variables = import './utils/variables.libsonnet';
local panels = import './utils/panels.libsonnet';
local queries = import './utils/queries.libsonnet';
local cfg = import '../config.libsonnet';

{
  grafanaDashboards+:: {
    'collector.json':
      g.dashboard.new(
        'OpenTelemetry Collector Health',
      )
      + g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.')
      + g.dashboard.graphTooltip.withSharedCrosshair()
      + g.dashboard.withVariables([
        variables.datasourceVariable,
        variables.jobVariable,
        variables.clusterVariable,
        variables.namespaceVariable,
        variables.instanceVariable,
      ])
      + g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json'])
      + g.dashboard.withPanels(
        g.util.grid.wrapPanels([
          // Overview row
          row.new('Overview'),
          panels.stat.base('Running Collectors', [queries.runningCollectors]),
          panels.table.uptime('Collector uptime', [queries.collectorUptime]),

          // Resources row
          row.new('Resources'),
          panels.timeSeries.cpuUsage('CPU usage', [queries.cpuUsage])
          + { gridPos: { w: 8 } },
          panels.timeSeries.memoryUsage('Memory (RSS)', queries.memUsageRSS)
          + { gridPos: { w: 8 } },
          panels.timeSeries.memoryUsage('Memory (Heap Alloc)', queries.memUsageHeapAlloc)
          + { gridPos: { w: 8 } },

          // Receivers row
          row.new('Receivers'),
          panels.timeSeries.short('Accepted metric points', [queries.acceptedMetricPoints])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Accepted log records', [queries.acceptedLogRecords])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Accepted spans', [queries.acceptedSpans])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Total incoming items', [queries.incomingItems])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Refused metric points', [queries.refusedMetricPoints])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Refused log records', [queries.refusedLogRecords])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Refused spans', [queries.refusedSpans])
          + { gridPos: { w: 6 } },
          panels.timeSeries.short('Total outgoing items', [queries.outgoingItems])
          + { gridPos: { w: 6 } },

          // Processors row
          row.new('Processors'),
          panels.heatmap.base('Number of units in the batch', [queries.batchSendSize])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Batch cardinality', [queries.batchCardinality])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Queue current size vs capacity', [queries.queueSize, queries.queueCapacity])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Batch size send trigger', [queries.batchSizeSendTrigger]),
          panels.timeSeries.short('Batch timeout send trigger', [queries.batchTimeoutSendTrigger]),

          // Exporters row
          row.new('Exporters'),
          panels.timeSeries.short('Exported metrics', [queries.exportedMetrics])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Exported logs', [queries.exportedLogs])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Exported spans', [queries.exportedSpans])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Failed metrics', [queries.failedMetrics])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Failed logs', [queries.failedLogs])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Failed spans', [queries.failedSpans])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Enqueue failed metrics', [queries.enqueueFailedMetrics])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Enqueue failed logs', [queries.enqueueFailedLogs])
          + { gridPos: { w: 8 } },
          panels.timeSeries.short('Enqueue failed spans', [queries.enqueueFailedSpans])
          + { gridPos: { w: 8 } },

          // Network traffic row
          row.new('Network traffic'),
          panels.timeSeries.seconds('Inbound gRPC request duration percentiles', [
            queries.grpcInboundDurationP50,
            queries.grpcInboundDurationP90,
            queries.grpcInboundDurationP99,
          ]),
          panels.timeSeries.seconds('Inbound HTTP request duration percentiles', [
            queries.httpInboundDurationP50,
            queries.httpInboundDurationP90,
            queries.httpInboundDurationP99,
          ]),
          panels.timeSeries.bytes('Inbound gRPC request size percentiles', [
            queries.grpcInboundSizeP50,
            queries.grpcInboundSizeP90,
            queries.grpcInboundSizeP99,
          ]),
          panels.timeSeries.bytes('Inbound HTTP request size percentiles', [
            queries.httpInboundSizeP50,
            queries.httpInboundSizeP90,
            queries.httpInboundSizeP99,
          ]),
          panels.timeSeries.seconds('Outgoing gRPC request duration percentiles', [
            queries.grpcOutboundDurationP50,
            queries.grpcOutboundDurationP90,
            queries.grpcOutboundDurationP99,
          ]),
          panels.timeSeries.seconds('Outgoing HTTP request duration percentiles', [
            queries.httpOutboundDurationP50,
            queries.httpOutboundDurationP90,
            queries.httpOutboundDurationP99,
          ]),
          panels.timeSeries.bytes('Outgoing gRPC request size percentiles', [
            queries.grpcOutboundSizeP50,
            queries.grpcOutboundSizeP90,
            queries.grpcOutboundSizeP99,
          ]),
          panels.timeSeries.bytes('Outgoing HTTP request size percentiles', [
            queries.httpOutboundSizeP50,
            queries.httpOutboundSizeP90,
            queries.httpOutboundSizeP99,
          ]),

        ], panelWidth=12, panelHeight=8),
      ),
  },
}
@@ -0,0 +1 @@
(import 'collector.libsonnet')