-
Notifications
You must be signed in to change notification settings - Fork 178
chore: Modernize the Apache CouchDB mixin #1522
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 13 commits
8cad84a
c3f6bb4
a969714
2c6abca
d9e918d
17e7fa2
7006703
092776c
9186490
f492f27
467d456
e6a5654
7f424aa
58ca718
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,14 @@ | ||
| { | ||
| prometheusAlerts+:: { | ||
| new(this): { | ||
| groups+: [ | ||
| { | ||
| name: 'ApacheCouchDBAlerts', | ||
| rules: [ | ||
| { | ||
| alert: 'CouchDBUnhealthyCluster', | ||
| expr: ||| | ||
| min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s | ||
| ||| % $._config, | ||
| min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
|
|
@@ -19,14 +19,14 @@ | |
| ( | ||
| '{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' + | ||
| 'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBHigh4xxResponseCodes', | ||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -37,14 +37,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBHigh5xxResponseCodes', | ||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s,code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
|
|
@@ -55,14 +55,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBModerateRequestLatency', | ||
| expr: ||| | ||
| sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsWarningRequestLatency5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -73,14 +73,14 @@ | |
| ( | ||
| 'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarningRequestLatency5m)sms. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBHighRequestLatency', | ||
| expr: ||| | ||
| sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsCriticalRequestLatency5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
|
|
@@ -91,14 +91,14 @@ | |
| ( | ||
| 'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBManyReplicatorJobsPending', | ||
| expr: ||| | ||
| sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (couchdb_couch_replicator_jobs_pending{%(filteringSelector)s}) > %(alertsWarningPendingReplicatorJobs5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -109,14 +109,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBReplicatorJobsCrashing', | ||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total{%(filteringSelector)s}[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
|
|
@@ -127,14 +127,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBReplicatorChangesQueuesDying', | ||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total{%(filteringSelector)s}[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -145,14 +145,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBReplicatorConnectionOwnersCrashing', | ||
| alert: 'CouchDBReplicatorOwnersCrashing', | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would changing the alert name affect existing installations? What if someone uninstalls or upgrades an integration, would they get duplicate alerts? Looping in @Dasomeone for more context. |
||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -163,14 +163,14 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'CouchDBReplicatorConnectionWorkersCrashing', | ||
| alert: 'CouchDBReplicatorWorkersCrashing', | ||
| expr: ||| | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s | ||
| ||| % $._config, | ||
| sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s | ||
| ||| % this.config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
|
|
@@ -181,7 +181,7 @@ | |
| ( | ||
| '{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' + | ||
| 'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. ' | ||
| ) % $._config, | ||
| ) % this.config, | ||
| }, | ||
| }, | ||
| ], | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,26 +1,49 @@ | ||
| { | ||
| _config+:: { | ||
| enableMultiCluster: false, | ||
| couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"', | ||
| multiClusterSelector: 'job=~"$job"', | ||
| local this = self, | ||
| filteringSelector: 'job="integrations/apache-couchdb"', | ||
| groupLabels: ['job', 'couchdb_cluster', 'cluster'], | ||
| logLabels: ['job', 'cluster', 'instance'], | ||
| instanceLabels: ['instance'], | ||
|
|
||
| dashboardTags: ['apache-couchdb-mixin'], | ||
| dashboardPeriod: 'now-1h', | ||
| dashboardTimezone: 'default', | ||
| dashboardRefresh: '1m', | ||
| dashboardTags: ['apache-couchdb-mixin'], | ||
| uid: 'couchdb', | ||
| dashboardNamePrefix: 'Apache CouchDB', | ||
| dashboardPeriod: 'now-1h', | ||
| dashboardTimezone: 'default', | ||
| dashboardRefresh: '1m', | ||
| metricsSource: [ | ||
| 'prometheus', | ||
| /* | ||
| * prometheusWithTotal is used for backwards compatibility, as some metrics were suffixed with _total in earlier versions of CouchDB but had the suffix dropped in later versions. | ||
| * i.e. couchdb_open_os_files_total => couchdb_open_os_files | ||
| * This ensures that the signals for the metrics that are suffixed with _total continue to work as expected. | ||
| * This was identified as a noticeable change from 3.3.0 to 3.5.0. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you! 🚀 |
||
| */ | ||
| 'prometheusWithTotal', | ||
| ], | ||
|
|
||
| //alert thresholds | ||
| alertsCriticalClusterIsUnstable5m: 1, //1 is stable | ||
| alertsWarning4xxResponseCodes5m: 5, | ||
| alertsCritical5xxResponseCodes5m: 0, | ||
| alertsWarningRequestLatency5m: 500, //ms | ||
| alertsCriticalRequestLatency5m: 1000, //ms | ||
| alertsWarningPendingReplicatorJobs5m: 10, | ||
| alertsCriticalCrashingReplicatorJobs5m: 0, | ||
| alertsWarningDyingReplicatorChangesQueues5m: 0, | ||
| alertsWarningCrashingReplicatorConnectionOwners5m: 0, | ||
| alertsWarningCrashingReplicatorConnectionWorkers5m: 0, | ||
| // Logging configuration | ||
| enableLokiLogs: true, | ||
| extraLogLabels: ['level'], | ||
| logsVolumeGroupBy: 'level', | ||
| showLogsVolume: true, | ||
|
|
||
| enableLokiLogs: true, | ||
| //alert thresholds | ||
| alertsCriticalClusterIsUnstable5m: 1, //1 is stable | ||
| alertsWarning4xxResponseCodes5m: 5, | ||
| alertsCritical5xxResponseCodes5m: 0, | ||
| alertsWarningRequestLatency5m: 500, //ms | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is in milliseconds, which means that the alert threshold is broken. |
||
| alertsCriticalRequestLatency5m: 1000, //ms | ||
| alertsWarningPendingReplicatorJobs5m: 10, | ||
| alertsCriticalCrashingReplicatorJobs5m: 0, | ||
| alertsWarningDyingReplicatorChangesQueues5m: 0, | ||
| alertsWarningCrashingReplicatorConnectionOwners5m: 0, | ||
| alertsWarningCrashingReplicatorConnectionWorkers5m: 0, | ||
|
|
||
| // Signals configuration | ||
| signals+: { | ||
| overview: (import './signals/overview.libsonnet')(this), | ||
| nodes: (import './signals/nodes.libsonnet')(this), | ||
| replicator: (import './signals/replicator.libsonnet')(this), | ||
| }, | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| local g = import './g.libsonnet'; | ||
| local logslib = import 'logs-lib/logs/main.libsonnet'; | ||
|
|
||
| { | ||
| local root = self, | ||
|
|
||
| new(this):: | ||
| local prefix = this.config.dashboardNamePrefix; | ||
| local links = this.grafana.links; | ||
| local tags = this.config.dashboardTags; | ||
| local uid = g.util.string.slugify(this.config.uid); | ||
| local vars = this.grafana.variables; | ||
| local annotations = this.grafana.annotations; | ||
| local refresh = this.config.dashboardRefresh; | ||
| local period = this.config.dashboardPeriod; | ||
| local timezone = this.config.dashboardTimezone; | ||
| { | ||
| 'couchdb-overview.json': | ||
| g.dashboard.new(prefix + ' overview') | ||
| + g.dashboard.withPanels( | ||
| g.util.panel.resolveCollapsedFlagOnRows( | ||
| g.util.grid.wrapPanels( | ||
| [ | ||
| this.grafana.rows.overview, | ||
| this.grafana.rows.overviewRequests, | ||
| this.grafana.rows.overviewReplication, | ||
| ] | ||
| ) | ||
| ) | ||
| ) + root.applyCommon( | ||
| vars.multiInstance, | ||
| uid + '_overview', | ||
| tags, | ||
| links { couchdbOverview+:: {} }, | ||
| annotations, | ||
| timezone, | ||
| refresh, | ||
| period | ||
| ), | ||
|
|
||
| 'couchdb-nodes.json': | ||
| g.dashboard.new(prefix + ' nodes') | ||
| + g.dashboard.withPanels( | ||
| g.util.panel.resolveCollapsedFlagOnRows( | ||
| g.util.grid.wrapPanels( | ||
| [ | ||
| this.grafana.rows.nodes, | ||
| this.grafana.rows.nodeRequests, | ||
| this.grafana.rows.nodeLogs, | ||
| ], | ||
| ), | ||
| ), | ||
| ) + root.applyCommon( | ||
| vars.multiInstance, | ||
| uid + '_nodes', | ||
| tags, | ||
| links { couchdbNodes+:: {} }, | ||
| annotations, | ||
| timezone, | ||
| refresh, | ||
| period | ||
| ), | ||
|
|
||
| } | ||
| + if this.config.enableLokiLogs then { | ||
| 'couchdb-logs.json': | ||
| logslib.new( | ||
| prefix + ' logs', | ||
| datasourceName=this.grafana.variables.datasources.loki.name, | ||
| datasourceRegex=this.grafana.variables.datasources.loki.regex, | ||
| filterSelector=this.config.filteringSelector, | ||
| labels=this.config.groupLabels + this.config.extraLogLabels, | ||
| formatParser=null, | ||
| showLogsVolume=this.config.showLogsVolume, | ||
| ) | ||
| { | ||
| dashboards+: | ||
| { | ||
| logs+: | ||
| root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { couchdbLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period), | ||
| }, | ||
| panels+: | ||
| { | ||
| logs+: | ||
| g.panel.logs.options.withEnableLogDetails(true) | ||
| + g.panel.logs.options.withShowTime(false) | ||
| + g.panel.logs.options.withWrapLogMessage(false), | ||
| }, | ||
| variables+: { | ||
| toArray+: [ | ||
| this.grafana.variables.datasources.prometheus { hide: 2 }, | ||
| ], | ||
| }, | ||
| }.dashboards.logs, | ||
| } | ||
| else {}, | ||
|
|
||
| applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): | ||
| g.dashboard.withTags(tags) | ||
| + g.dashboard.withUid(uid) | ||
| + g.dashboard.withLinks(std.objectValues(links)) | ||
| + g.dashboard.withTimezone(timezone) | ||
| + g.dashboard.withRefresh(refresh) | ||
| + g.dashboard.time.withFrom(period) | ||
| + g.dashboard.withVariables(vars) | ||
| + g.dashboard.withAnnotations(std.objectValues(annotations)), | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still need to call out the version support here like I mentioned in the config file :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Called out in the readme :)