35 changes: 32 additions & 3 deletions apache-couchdb-mixin/README.md

Member:

Still need to call out the version support here like I mentioned in the config file :)


Contributor Author:

Called out in the readme :)

@@ -18,7 +18,7 @@ and the following alerts:
- CouchDBReplicatorJobsCrashing
- CouchDBReplicatorChangesQueuesDying
- CouchDBReplicatorConnectionOwnersCrashing
- CouchDBReplicatorConnectionWorkersCrashing
- CouchDBReplicatorWorkersCrashing

## Apache CouchDB Overview

@@ -58,6 +58,35 @@ scrape_configs:
__path__: /var/log/couchdb/couchdb.log
```

## CouchDB Version Compatibility

This mixin supports **Apache CouchDB 3.3.1 and later** and handles differences in metric naming conventions between versions.

### Metric Naming Changes

Between CouchDB 3.3.0 and 3.5.0, there was a change in how some metrics are named. Specifically, some metrics that previously had a `_total` suffix no longer include it in newer versions:

- **CouchDB 3.3.0 and earlier**: `couchdb_open_os_files_total`
- **CouchDB 3.5.0 and later**: `couchdb_open_os_files`
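
Dashboards or ad-hoc queries that need to span both versions can union the two names with PromQL's `or` operator. For example (a sketch, using the example metric above):

```promql
# Matches whichever spelling the scraped CouchDB version exposes
sum by (instance) (couchdb_open_os_files or couchdb_open_os_files_total)
```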

### How the Mixin Handles This

By default, the mixin is configured to work with both naming conventions automatically through the `metricsSource` configuration in `config.libsonnet`. This ensures dashboards and alerts work correctly regardless of which CouchDB version you're running.

If you need to customize this behavior, you can modify the `metricsSource` in your `config.libsonnet`:

```jsonnet
{
  _config+:: {
    // For CouchDB 3.5.0+ only (no _total suffix):
    metricsSource: ['prometheus'],

    // Or, for backwards compatibility with both naming conventions:
    // metricsSource: ['prometheus', 'prometheusWithTotal'],
  },
}
```

## Alerts Overview

- CouchDBUnhealthyCluster: At least one of the nodes in a cluster is reporting the cluster as being unstable.
@@ -68,8 +97,8 @@ scrape_configs:
- CouchDBManyReplicatorJobsPending: There is a high number of replicator jobs pending for a node.
- CouchDBReplicatorJobsCrashing: There are replicator jobs crashing for a node.
- CouchDBReplicatorChangesQueuesDying: There are replicator changes queue process deaths for a node.
- CouchDBReplicatorConnectionOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorConnectionWorkersCrashing: There are replicator connection worker process crashes for a node.
- CouchDBReplicatorOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorWorkersCrashing: There are replicator connection worker process crashes for a node.

## Install tools

@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheCouchDBAlerts',
rules: [
{
alert: 'CouchDBUnhealthyCluster',
expr: |||
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s
||| % $._config,
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
@@ -19,14 +19,14 @@
(
'{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' +
'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh4xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh5xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
@@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBModerateRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsWarningRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -73,14 +73,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHighRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsCriticalRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
@@ -91,14 +91,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBManyReplicatorJobsPending',
expr: |||
sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (couchdb_couch_replicator_jobs_pending{%(filteringSelector)s}) > %(alertsWarningPendingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -109,14 +109,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorJobsCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total{%(filteringSelector)s}[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
@@ -127,14 +127,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorChangesQueuesDying',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total{%(filteringSelector)s}[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -145,14 +145,14 @@
(
'{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionOwnersCrashing',
alert: 'CouchDBReplicatorOwnersCrashing',
Would changing the alert name affect existing installations? What if someone uninstalls or upgrades an integration, would they get duplicate alerts?

Looping in @Dasomeone for more context.

expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -163,14 +163,14 @@
(
'{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionWorkersCrashing',
alert: 'CouchDBReplicatorWorkersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
@@ -181,7 +181,7 @@
(
'{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. '
) % $._config,
) % this.config,
},
},
],
63 changes: 43 additions & 20 deletions apache-couchdb-mixin/config.libsonnet
@@ -1,26 +1,49 @@
{
_config+:: {
enableMultiCluster: false,
couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiClusterSelector: 'job=~"$job"',
local this = self,
filteringSelector: 'job="integrations/apache-couchdb"',
groupLabels: ['job', 'couchdb_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance'],

dashboardTags: ['apache-couchdb-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: ['apache-couchdb-mixin'],
uid: 'couchdb',
dashboardNamePrefix: 'Apache CouchDB',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheus',
/*
* prometheusWithTotal is used for backwards compatibility, as some metrics that carried a _total suffix in earlier CouchDB versions drop the suffix in later versions.
* i.e. couchdb_open_os_files_total => couchdb_open_os_files
* This ensures that the signals for metrics previously suffixed with _total continue to work as expected.
* This was identified as a noticeable change from 3.3.0 to 3.5.0.

Member:

Thank you! 🚀
Can you call this out in the readme as well please? E.g. just what versions are supported, and what the different metricSources are for.


*/
'prometheusWithTotal',
],

//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
This is in milliseconds, which means that the alert threshold is broken.

alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
replicator: (import './signals/replicator.libsonnet')(this),
},
}