Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions apache-couchdb-mixin/README.md
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still need to call out the version support here like I mentioned in the config file :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Called out in the readme :)

Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ and the following alerts:
- CouchDBReplicatorJobsCrashing
- CouchDBReplicatorChangesQueuesDying
- CouchDBReplicatorOwnersCrashing
- CouchDBReplicatorConnectionWorkersCrashing
- CouchDBReplicatorWorkersCrashing

## Apache CouchDB Overview

Expand Down Expand Up @@ -68,8 +68,8 @@ scrape_configs:
- CouchDBManyReplicatorJobsPending: There is a high number of replicator jobs pending for a node.
- CouchDBReplicatorJobsCrashing: There are replicator jobs crashing for a node.
- CouchDBReplicatorChangesQueuesDying: There are replicator changes queue process deaths for a node.
- CouchDBReplicatorConnectionOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorConnectionWorkersCrashing: There are replicator connection worker process crashes for a node.
- CouchDBReplicatorOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorWorkersCrashing: There are replicator connection worker process crashes for a node.

## Install tools

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheCouchDBAlerts',
rules: [
{
alert: 'CouchDBUnhealthyCluster',
expr: |||
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s
||| % $._config,
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' +
'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh4xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh5xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBModerateRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsWarningRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -73,14 +73,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHighRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsCriticalRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -91,14 +91,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBManyReplicatorJobsPending',
expr: |||
sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (couchdb_couch_replicator_jobs_pending{%(filteringSelector)s}) > %(alertsWarningPendingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -109,14 +109,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorJobsCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total{%(filteringSelector)s}[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -127,14 +127,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorChangesQueuesDying',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total{%(filteringSelector)s}[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -145,14 +145,14 @@
(
'{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionOwnersCrashing',
alert: 'CouchDBReplicatorOwnersCrashing',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would changing the alert name affect existing installations? What if someone uninstalls or upgrades an integration, would they get duplicate alerts?

Looking in @Dasomeone for more context.

expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -163,14 +163,14 @@
(
'{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionWorkersCrashing',
alert: 'CouchDBReplicatorWorkersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -181,7 +181,7 @@
(
'{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. '
) % $._config,
) % this.config,
},
},
],
Expand Down
63 changes: 43 additions & 20 deletions apache-couchdb-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
{
_config+:: {
enableMultiCluster: false,
couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiClusterSelector: 'job=~"$job"',
local this = self,
filteringSelector: 'job="integrations/apache-couchdb"',
groupLabels: ['job', 'couchdb_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance'],

dashboardTags: ['apache-couchdb-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: ['apache-couchdb-mixin'],
uid: 'couchdb',
dashboardNamePrefix: 'Apache CouchDB',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheus',
/*
* The prometheusWithTotal source exists for backwards compatibility: in older CouchDB versions some metrics carry a _total suffix that was dropped in later versions,
* e.g. couchdb_open_os_files_total => couchdb_open_os_files.
* This ensures that signals for the _total-suffixed metrics continue to work as expected.
* This was identified as a noticeable change from CouchDB 3.3.0 to 3.5.0.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! 🚀
Can you call this out in the readme as well please? E.g. just what versions are supported, and what the different metricSources are for

*/
'prometheusWithTotal',
],

//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in milliseconds, which means that the alert threshold is broken.

alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
replicator: (import './signals/replicator.libsonnet')(this),
},
}
107 changes: 107 additions & 0 deletions apache-couchdb-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
local g = import './g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
  local root = self,

  // Builds the mixin's Grafana dashboards from a configured top-level `this`
  // object (expects `this.config` and `this.grafana` to be populated).
  // Returns an object keyed by output file name, one dashboard JSON per key.
  new(this)::
    // Shorthands pulled from the mixin configuration and grafana helpers.
    local prefix = this.config.dashboardNamePrefix;
    local links = this.grafana.links;
    local tags = this.config.dashboardTags;
    // Slugified so the configured uid is safe to use as a Grafana dashboard UID.
    local uid = g.util.string.slugify(this.config.uid);
    local vars = this.grafana.variables;
    local annotations = this.grafana.annotations;
    local refresh = this.config.dashboardRefresh;
    local period = this.config.dashboardPeriod;
    local timezone = this.config.dashboardTimezone;
    {
      // Cluster-level overview dashboard: overview, request and replication rows.
      'couchdb-overview.json':
        g.dashboard.new(prefix + ' overview')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.overview,
                this.grafana.rows.overviewRequests,
                this.grafana.rows.overviewReplication,
              ]
            )
          )
        ) + root.applyCommon(
          vars.multiInstance,
          // NOTE(review): overview/nodes use an underscore suffix ('_overview',
          // '_nodes') while the logs dashboard below uses '-logs'. UIDs are
          // stable identifiers, so confirm before normalizing.
          uid + '_overview',
          tags,
          // Hide this dashboard's own entry so it does not link to itself.
          links { couchdbOverview+:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),

      // Per-node dashboard: node health, request and log rows.
      'couchdb-nodes.json':
        g.dashboard.new(prefix + ' nodes')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.nodes,
                this.grafana.rows.nodeRequests,
                this.grafana.rows.nodeLogs,
              ],
            ),
          ),
        ) + root.applyCommon(
          vars.multiInstance,
          uid + '_nodes',
          tags,
          // Hide this dashboard's own entry so it does not link to itself.
          links { couchdbNodes+:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),

    }
    // The logs dashboard is only generated when Loki logs are enabled.
    + if this.config.enableLokiLogs then {
      'couchdb-logs.json':
        logslib.new(
          prefix + ' logs',
          datasourceName=this.grafana.variables.datasources.loki.name,
          datasourceRegex=this.grafana.variables.datasources.loki.regex,
          filterSelector=this.config.filteringSelector,
          labels=this.config.groupLabels + this.config.extraLogLabels,
          formatParser=null,
          showLogsVolume=this.config.showLogsVolume,
        )
        // Patch the logslib-built object, then export only its logs dashboard.
        {
          dashboards+:
            {
              logs+:
                // Re-apply the shared dashboard settings on top of what logslib
                // built; `super.logs.templating.list` keeps logslib's variables.
                root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { couchdbLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
            },
          panels+:
            {
              logs+:
                g.panel.logs.options.withEnableLogDetails(true)
                + g.panel.logs.options.withShowTime(false)
                + g.panel.logs.options.withWrapLogMessage(false),
            },
          variables+: {
            toArray+: [
              // Append the Prometheus datasource variable; hide: 2 — presumably
              // hides the variable in the Grafana UI, TODO confirm against the
              // dashboard JSON model.
              this.grafana.variables.datasources.prometheus { hide: 2 },
            ],
          },
        }.dashboards.logs,
    }
    else {},

  // Applies the settings shared by every dashboard: template variables, uid,
  // tags, cross-dashboard links, annotations, timezone, refresh interval and
  // the default time-range start (`period`, e.g. 'now-1h').
  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
    g.dashboard.withTags(tags)
    + g.dashboard.withUid(uid)
    + g.dashboard.withLinks(std.objectValues(links))
    + g.dashboard.withTimezone(timezone)
    + g.dashboard.withRefresh(refresh)
    + g.dashboard.time.withFrom(period)
    + g.dashboard.withVariables(vars)
    + g.dashboard.withAnnotations(std.objectValues(annotations)),
}
Loading
Loading