Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions apache-couchdb-mixin/README.md
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still need to call out the version support here like I mentioned in the config file :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Called out in the readme :)

Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ and the following alerts:
- CouchDBReplicatorJobsCrashing
- CouchDBReplicatorChangesQueuesDying
- CouchDBReplicatorOwnersCrashing
- CouchDBReplicatorConnectionWorkersCrashing
- CouchDBReplicatorWorkersCrashing

## Apache CouchDB Overview

Expand Down Expand Up @@ -68,8 +68,8 @@ scrape_configs:
- CouchDBManyReplicatorJobsPending: There is a high number of replicator jobs pending for a node.
- CouchDBReplicatorJobsCrashing: There are replicator jobs crashing for a node.
- CouchDBReplicatorChangesQueuesDying: There are replicator changes queue process deaths for a node.
- CouchDBReplicatorConnectionOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorConnectionWorkersCrashing: There are replicator connection worker process crashes for a node.
- CouchDBReplicatorOwnersCrashing: There are replicator connection owner process crashes for a node.
- CouchDBReplicatorWorkersCrashing: There are replicator connection worker process crashes for a node.

## Install tools

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheCouchDBAlerts',
rules: [
{
alert: 'CouchDBUnhealthyCluster',
expr: |||
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s
||| % $._config,
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' +
'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh4xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh5xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBModerateRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsWarningRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -73,14 +73,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHighRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s
||| % $._config,
sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsCriticalRequestLatency5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -91,14 +91,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBManyReplicatorJobsPending',
expr: |||
sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (couchdb_couch_replicator_jobs_pending{%(filteringSelector)s}) > %(alertsWarningPendingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -109,14 +109,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorJobsCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total{%(filteringSelector)s}[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -127,14 +127,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorChangesQueuesDying',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total{%(filteringSelector)s}[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -145,14 +145,14 @@
(
'{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionOwnersCrashing',
alert: 'CouchDBReplicatorOwnersCrashing',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would changing the alert name affect existing installations? What if someone uninstalls or upgrades an integration, would they get duplicate alerts?

Looking in @Dasomeone for more context.

expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -163,14 +163,14 @@
(
'{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionWorkersCrashing',
alert: 'CouchDBReplicatorWorkersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -181,7 +181,7 @@
(
'{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. '
) % $._config,
) % this.config,
},
},
],
Expand Down
63 changes: 43 additions & 20 deletions apache-couchdb-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
{
_config+:: {
enableMultiCluster: false,
couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiClusterSelector: 'job=~"$job"',
local this = self,
filteringSelector: 'job="integrations/apache-couchdb"',
groupLabels: ['job', 'couchdb_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance'],

dashboardTags: ['apache-couchdb-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: ['apache-couchdb-mixin'],
uid: 'couchdb',
dashboardNamePrefix: 'Apache CouchDB',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheus',
/*
* The prometheusWithTotal source exists for backwards compatibility: in older CouchDB versions some metrics carry a _total suffix that was dropped in later versions,
* e.g. couchdb_open_os_files_total => couchdb_open_os_files.
* This ensures that signals for the _total-suffixed metrics continue to work as expected.
* This was identified as a noticeable change from CouchDB 3.3.0 to 3.5.0.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! 🚀
Can you call this out in the readme as well please? E.g. just what versions are supported, and what the different metricSources are for

*/
'prometheusWithTotal',
],

//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in milliseconds, which means that the alert threshold is broken.

alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
replicator: (import './signals/replicator.libsonnet')(this),
},
}
107 changes: 107 additions & 0 deletions apache-couchdb-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
local g = import './g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
  local root = self,

  // Builds the mixin's Grafana dashboards from a configured top-level `this`
  // object (expects `this.config` and `this.grafana` to be populated).
  // Returns an object keyed by output file name, one dashboard JSON per key.
  new(this)::
    // Shorthands pulled from the mixin configuration and grafana helpers.
    local prefix = this.config.dashboardNamePrefix;
    local links = this.grafana.links;
    local tags = this.config.dashboardTags;
    // Slugified so the configured uid is safe to use as a Grafana dashboard UID.
    local uid = g.util.string.slugify(this.config.uid);
    local vars = this.grafana.variables;
    local annotations = this.grafana.annotations;
    local refresh = this.config.dashboardRefresh;
    local period = this.config.dashboardPeriod;
    local timezone = this.config.dashboardTimezone;
    {
      // Cluster-level overview dashboard: overview, request and replication rows.
      'couchdb-overview.json':
        g.dashboard.new(prefix + ' overview')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.overview,
                this.grafana.rows.overviewRequests,
                this.grafana.rows.overviewReplication,
              ]
            )
          )
        ) + root.applyCommon(
          vars.multiInstance,
          // NOTE(review): overview/nodes use an underscore suffix ('_overview',
          // '_nodes') while the logs dashboard below uses '-logs'. UIDs are
          // stable identifiers, so confirm before normalizing.
          uid + '_overview',
          tags,
          // Hide this dashboard's own entry so it does not link to itself.
          links { couchdbOverview+:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),

      // Per-node dashboard: node health, request and log rows.
      'couchdb-nodes.json':
        g.dashboard.new(prefix + ' nodes')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.nodes,
                this.grafana.rows.nodeRequests,
                this.grafana.rows.nodeLogs,
              ],
            ),
          ),
        ) + root.applyCommon(
          vars.multiInstance,
          uid + '_nodes',
          tags,
          // Hide this dashboard's own entry so it does not link to itself.
          links { couchdbNodes+:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),

    }
    // The logs dashboard is only generated when Loki logs are enabled.
    + if this.config.enableLokiLogs then {
      'couchdb-logs.json':
        logslib.new(
          prefix + ' logs',
          datasourceName=this.grafana.variables.datasources.loki.name,
          datasourceRegex=this.grafana.variables.datasources.loki.regex,
          filterSelector=this.config.filteringSelector,
          labels=this.config.groupLabels + this.config.extraLogLabels,
          formatParser=null,
          showLogsVolume=this.config.showLogsVolume,
        )
        // Patch the logslib-built object, then export only its logs dashboard.
        {
          dashboards+:
            {
              logs+:
                // Re-apply the shared dashboard settings on top of what logslib
                // built; `super.logs.templating.list` keeps logslib's variables.
                root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { couchdbLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
            },
          panels+:
            {
              logs+:
                g.panel.logs.options.withEnableLogDetails(true)
                + g.panel.logs.options.withShowTime(false)
                + g.panel.logs.options.withWrapLogMessage(false),
            },
          variables+: {
            toArray+: [
              // Append the Prometheus datasource variable; hide: 2 — presumably
              // hides the variable in the Grafana UI, TODO confirm against the
              // dashboard JSON model.
              this.grafana.variables.datasources.prometheus { hide: 2 },
            ],
          },
        }.dashboards.logs,
    }
    else {},

  // Applies the settings shared by every dashboard: template variables, uid,
  // tags, cross-dashboard links, annotations, timezone, refresh interval and
  // the default time-range start (`period`, e.g. 'now-1h').
  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
    g.dashboard.withTags(tags)
    + g.dashboard.withUid(uid)
    + g.dashboard.withLinks(std.objectValues(links))
    + g.dashboard.withTimezone(timezone)
    + g.dashboard.withRefresh(refresh)
    + g.dashboard.time.withFrom(period)
    + g.dashboard.withVariables(vars)
    + g.dashboard.withAnnotations(std.objectValues(annotations)),
}
Loading
Loading