diff --git a/docs/severity.md b/docs/severity.md index 15dea78a5..97e439749 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -269,11 +269,11 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |AWS EFS used space|X|X|-|-|-| -|AWS EFS percent of io limit|-|X|X|-|-| -|AWS EFS percent of read throughput|-|-|X|X|-| -|AWS EFS percent of write throughput|-|-|X|X|-| -|AWS EFS percent of permitted throughput|-|X|X|-|-| -|AWS EFS burst credit balance|-|X|-|-|-| +|AWS EFS percent of io limit|X|X|-|-|-| +|AWS EFS percent of read throughput|X|X|-|-|-| +|AWS EFS percent of write throughput|X|X|-|-|-| +|AWS EFS percent of permitted throughput|X|X|-|-|-| +|AWS EFS burst credit balance|X|-|-|-|-| ## integration_aws-elasticache-common diff --git a/modules/integration_aws-alb/conf/00-heartbeat.yaml b/modules/integration_aws-alb/conf/00-heartbeat.yaml index e4bbd449d..bc8004140 100644 --- a/modules/integration_aws-alb/conf/00-heartbeat.yaml +++ b/modules/integration_aws-alb/conf/00-heartbeat.yaml @@ -4,6 +4,7 @@ name: heartbeat transformation: false aggregation: ".mean(by=['LoadBalancer'])" filtering: "filter('namespace', 'AWS/ApplicationELB')" +condition: "var.heartbeat_detector_enabled" signals: signal: diff --git a/modules/integration_aws-alb/conf/01-latency.yaml b/modules/integration_aws-alb/conf/01-latency.yaml index 5f4ea61c8..557d462ca 100644 --- a/modules/integration_aws-alb/conf/01-latency.yaml +++ b/modules/integration_aws-alb/conf/01-latency.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB')" value_unit: "Second" +condition: "var.latency_detector_enabled" signals: signal: diff --git a/modules/integration_aws-alb/conf/02-lb-5xx.yaml b/modules/integration_aws-alb/conf/02-lb-5xx.yaml index 8ef21a35e..3ecbd045e 100644 --- a/modules/integration_aws-alb/conf/02-lb-5xx.yaml +++ b/modules/integration_aws-alb/conf/02-lb-5xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.lb_5xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/03-lb-4xx.yaml b/modules/integration_aws-alb/conf/03-lb-4xx.yaml index d31fe8d70..ddcbeab2b 100644 --- a/modules/integration_aws-alb/conf/03-lb-4xx.yaml +++ b/modules/integration_aws-alb/conf/03-lb-4xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.lb_4xx_detector_enabled" signals: errors: @@ -25,7 +26,6 @@ rules: lasting_duration: 15m lasting_at_least: 0.9 append_condition: and when(requests > ${var.minimum_traffic}) - disabled: true major: threshold: 95 comparator: ">" diff --git a/modules/integration_aws-alb/conf/04-target-5xx.yaml b/modules/integration_aws-alb/conf/04-target-5xx.yaml index 8d8f3c1d9..584423b13 100644 --- a/modules/integration_aws-alb/conf/04-target-5xx.yaml +++ b/modules/integration_aws-alb/conf/04-target-5xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.target_5xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/05-target-4xx.yaml 
b/modules/integration_aws-alb/conf/05-target-4xx.yaml index a7d96384a..1e6fff490 100644 --- a/modules/integration_aws-alb/conf/05-target-4xx.yaml +++ b/modules/integration_aws-alb/conf/05-target-4xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.target_4xx_detector_enabled" signals: errors: @@ -25,7 +26,6 @@ rules: lasting_duration: 15m lasting_at_least: 0.9 append_condition: and when(requests > ${var.minimum_traffic}) - disabled: true major: threshold: 95 comparator: ">" diff --git a/modules/integration_aws-alb/conf/06-healthy.yaml b/modules/integration_aws-alb/conf/06-healthy.yaml index 4f75d0ab3..47eddfc0d 100644 --- a/modules/integration_aws-alb/conf/06-healthy.yaml +++ b/modules/integration_aws-alb/conf/06-healthy.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.healthy_detector_enabled" signals: healthy: diff --git a/modules/integration_aws-alb/detectors-gen.tf b/modules/integration_aws-alb/detectors-gen.tf index 578805590..ed6ded754 100644 --- a/modules/integration_aws-alb/detectors-gen.tf +++ b/modules/integration_aws-alb/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "latency" { + count = (var.latency_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target response time") authorized_writer_teams = var.authorized_writer_teams @@ -74,6 +78,8 @@ EOF } resource "signalfx_detector" "alb_5xx" { + count = (var.lb_5xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB 5xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -122,6 +128,8 @@ EOF } resource "signalfx_detector" "alb_4xx" { + count = (var.lb_4xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB 4xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -183,6 +191,8 @@ EOF } resource "signalfx_detector" "target_5xx" { + count = (var.target_5xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target 5xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -231,6 +241,8 @@ EOF } resource "signalfx_detector" "target_4xx" { + count = (var.target_4xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target 4xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -292,6 +304,8 @@ EOF } resource "signalfx_detector" "healthy" { + count = (var.healthy_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB healthy instances percentage") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-alb/variables-gen.tf b/modules/integration_aws-alb/variables-gen.tf index 83af98b31..cf7730c4b 100644 --- a/modules/integration_aws-alb/variables-gen.tf +++ b/modules/integration_aws-alb/variables-gen.tf @@ -269,7 +269,7 @@ variable "alb_4xx_disabled" { variable "alb_4xx_disabled_critical" { description = "Disable critical alerting rule for alb_4xx detector" type = bool - default = true + default = null } variable "alb_4xx_disabled_major" { @@ -472,7 +472,7 @@ variable "target_4xx_disabled" { variable "target_4xx_disabled_critical" { description = "Disable critical alerting rule for target_4xx detector" type = bool - default = true + default = null } variable "target_4xx_disabled_major" { diff --git a/modules/integration_aws-alb/variables.tf b/modules/integration_aws-alb/variables.tf index 696da8ee1..3f74ec7e7 100644 --- a/modules/integration_aws-alb/variables.tf +++ b/modules/integration_aws-alb/variables.tf @@ -1,3 +1,45 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "latency_detector_enabled" { + description = "Enable latency detector" + type = bool + default = true +} + +variable "lb_5xx_detector_enabled" { + description = "Enable lb 5xx detector" + type = bool + default = true +} + +variable "lb_4xx_detector_enabled" { + description = "Enable lb 4xx detector" + type = bool + default = true +} + +variable "target_5xx_detector_enabled" { + description = "Enable target 5xx detector" + type = bool + default = true +} + +variable "target_4xx_detector_enabled" { + description = "Enable target 4xx detector" + type = bool + default = true +} + +variable "healthy_detector_enabled" { + description = "Enable healthy detector" + type = bool + default = true +} + # Module specific variable "minimum_traffic" { diff --git a/modules/integration_aws-efs/README.md b/modules/integration_aws-efs/README.md index 91dccd070..5562fb540 100644 --- a/modules/integration_aws-efs/README.md +++ b/modules/integration_aws-efs/README.md @@ -28,14 +28,10 @@ existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/G module "signalfx-detectors-integration-aws-efs" { source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-efs?ref={revision}" - environment = var.environment - notifications = local.notifications - used_space_threshold_major = 42 - write_throughput_threshold_minor = 42 - read_throughput_threshold_minor = 42 - read_throughput_threshold_warning = 42 - write_throughput_threshold_warning = 42 - used_space_threshold_critical = 42 + environment = var.environment + notifications = local.notifications + used_space_threshold_major = 42 + used_space_threshold_critical = 42 } ``` @@ -87,11 +83,11 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |AWS EFS used space|X|X|-|-|-| -|AWS EFS percent of io limit|-|X|X|-|-| -|AWS EFS percent of read throughput|-|-|X|X|-| -|AWS EFS percent of write throughput|-|-|X|X|-| -|AWS EFS percent of permitted throughput|-|X|X|-|-| -|AWS EFS burst credit balance|-|X|-|-|-| +|AWS EFS percent of io limit|X|X|-|-|-| +|AWS EFS percent of read throughput|X|X|-|-|-| +|AWS EFS percent of write throughput|X|X|-|-|-| +|AWS EFS percent of permitted 
throughput|X|X|-|-|-| +|AWS EFS burst credit balance|X|-|-|-|-| ## How to collect required metrics? diff --git a/modules/integration_aws-efs/conf/01-used-space.yaml b/modules/integration_aws-efs/conf/01-used-space.yaml index b1453ff46..4ea948ab3 100644 --- a/modules/integration_aws-efs/conf/01-used-space.yaml +++ b/modules/integration_aws-efs/conf/01-used-space.yaml @@ -2,7 +2,7 @@ module: "AWS EFS" name: "Used Space" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "Gibibyte" -transformation: ".max(over='15m')" +condition: "var.used_space_detector_enabled" signals: used_space: metric: "StorageBytes" @@ -13,6 +13,8 @@ signals: rules: critical: comparator: ">" + lasting_duration: "15m" major: comparator: ">" + lasting_duration: "15m" dependency: critical diff --git a/modules/integration_aws-efs/conf/02-io-limit.yaml b/modules/integration_aws-efs/conf/02-io-limit.yaml index dff96360a..01df7833f 100644 --- a/modules/integration_aws-efs/conf/02-io-limit.yaml +++ b/modules/integration_aws-efs/conf/02-io-limit.yaml @@ -3,17 +3,19 @@ name: "Percent of IO Limit" id: "io_limit" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".mean(over='30m')" +condition: "var.io_limit_detector_enabled" tip: "If you reach too often the limit with current General Purpose mode, consider moving your application to a file system using the Max I/O performance mode." signals: signal: metric: "PercentIOLimit" filter: "filter('stat', 'mean')" rules: - major: + critical: comparator: ">" threshold: 90 - minor: + lasting_duration: "30m" + major: comparator: ">" - dependency: major + dependency: critical threshold: 80 + lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index 84e109933..351d8a730 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -3,8 +3,7 @@ name: "Percent of read throughput" id: "read_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".max(over='15m')" -disabled: true +condition: "var.read_throughput_detector_enabled" signals: read: metric: "DataReadIOBytes" @@ -16,8 +15,12 @@ signals: formula: (read/total).scale(100) rules: - minor: + critical: comparator: ">" - warning: + threshold: 90 + lasting_duration: "15m" + major: + lasting_duration: "15m" comparator: ">" - dependency: minor + threshold: 80 + dependency: critical diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml b/modules/integration_aws-efs/conf/04-throughput-write.yaml index acff87008..63cae419a 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -3,8 +3,7 @@ name: "Percent of write throughput" id: "write_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".max(over='15m')" -disabled: true +condition: "var.write_throughput_detector_enabled" signals: write: metric: "DataWriteIOBytes" @@ -16,8 +15,12 @@ signals: formula: (write/total).scale(100) rules: - minor: + critical: comparator: ">" - warning: + threshold: 90 + lasting_duration: "15m" + major: comparator: ">" - dependency: minor + threshold: 80 + dependency: critical + lasting_duration: "15m" diff --git a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml index 14701fc41..f5da112bc 100644 --- 
a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml +++ b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml @@ -2,7 +2,7 @@ module: "AWS EFS" name: "Percent of permitted throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".mean(over='30m')" +condition: "var.permitted_throughput_detector_enabled" tip: "You are consuming the entire amount of throughput allocated to your file system, In this situation, you might consider changing the file system's throughput mode to Provisioned Throughput to get higher throughput." signals: metered: @@ -14,10 +14,12 @@ signals: signal: formula: (metered/permitted.scale(60)).scale(100) rules: - major: + critical: comparator: ">" threshold: 90 - minor: + lasting_duration: "30m" + major: comparator: ">" - dependency: major + dependency: critical threshold: 80 + lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml index bf8963cc2..59e748902 100644 --- a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml +++ b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml @@ -2,13 +2,14 @@ module: "AWS EFS" name: "Burst Credit Balance" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "credits" -transformation: ".mean(over='5m')" +condition: "var.burst_credit_balance_detector_enabled" tip: "See https://docs.aws.amazon.com/efs/latest/ug/performance.html#bursting" signals: signal: metric: "BurstCreditBalance" filter: "filter('stat', 'lower')" rules: - major: + critical: comparator: "<" threshold: 1 + lasting_duration: "5m" diff --git a/modules/integration_aws-efs/detectors-gen.tf b/modules/integration_aws-efs/detectors-gen.tf index d93f0af88..e83ba88d6 100644 --- a/modules/integration_aws-efs/detectors-gen.tf +++ b/modules/integration_aws-efs/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "used_space" { + count = (var.used_space_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS used space") authorized_writer_teams = var.authorized_writer_teams @@ -46,6 +48,8 @@ EOF } resource "signalfx_detector" "io_limit" { + count = (var.io_limit_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of io limit") authorized_writer_teams = var.authorized_writer_teams @@ -60,16 +64,16 @@ resource "signalfx_detector" "io_limit" { program_text = <<-EOF base_filtering = filter('namespace', 'AWS/EFS') signal = data('PercentIOLimit', filter=base_filtering and filter('stat', 'mean') and ${module.filtering.signalflow})${var.io_limit_aggregation_function}${var.io_limit_transformation_function}.publish('signal') - detect(when(signal > ${var.io_limit_threshold_major}%{if var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif})).publish('MAJOR') - detect(when(signal > ${var.io_limit_threshold_minor}%{if var.io_limit_lasting_duration_minor != null}, lasting='${var.io_limit_lasting_duration_minor}', at_least=${var.io_limit_at_least_percentage_minor}%{endif}) and (not when(signal > ${var.io_limit_threshold_major}%{if var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal > ${var.io_limit_threshold_critical}%{if var.io_limit_lasting_duration_critical != null}, lasting='${var.io_limit_lasting_duration_critical}', at_least=${var.io_limit_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.io_limit_threshold_major}%{if var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif}) and (not when(signal > ${var.io_limit_threshold_critical}%{if var.io_limit_lasting_duration_critical != null}, lasting='${var.io_limit_lasting_duration_critical}', at_least=${var.io_limit_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.io_limit_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.io_limit_disabled_major, var.io_limit_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.io_limit_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.io_limit_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.io_limit_disabled_critical, var.io_limit_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.io_limit_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.io_limit_runbook_url, var.runbook_url), "") tip = var.io_limit_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -77,11 +81,11 @@ EOF } rule { - description = "is too high > ${var.io_limit_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.io_limit_disabled_minor, var.io_limit_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.io_limit_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.io_limit_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.io_limit_disabled_major, var.io_limit_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.io_limit_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.io_limit_runbook_url, var.runbook_url), "") tip = var.io_limit_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -92,6 +96,8 @@ EOF } resource "signalfx_detector" "read_throughput" { + count = (var.read_throughput_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of read throughput") authorized_writer_teams = var.authorized_writer_teams @@ -108,16 +114,16 @@ resource "signalfx_detector" "read_throughput" { read = data('DataReadIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.read_throughput_aggregation_function}${var.read_throughput_transformation_function} total = data('TotalIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.read_throughput_aggregation_function}${var.read_throughput_transformation_function} signal = (read/total).scale(100).publish('signal') - detect(when(signal > ${var.read_throughput_threshold_minor}%{if var.read_throughput_lasting_duration_minor != null}, lasting='${var.read_throughput_lasting_duration_minor}', at_least=${var.read_throughput_at_least_percentage_minor}%{endif})).publish('MINOR') - detect(when(signal > ${var.read_throughput_threshold_warning}%{if var.read_throughput_lasting_duration_warning != null}, lasting='${var.read_throughput_lasting_duration_warning}', at_least=${var.read_throughput_at_least_percentage_warning}%{endif}) and (not when(signal > ${var.read_throughput_threshold_minor}%{if var.read_throughput_lasting_duration_minor != null}, lasting='${var.read_throughput_lasting_duration_minor}', at_least=${var.read_throughput_at_least_percentage_minor}%{endif}))).publish('WARN') + detect(when(signal > ${var.read_throughput_threshold_critical}%{if var.read_throughput_lasting_duration_critical != null}, lasting='${var.read_throughput_lasting_duration_critical}', at_least=${var.read_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.read_throughput_threshold_major}%{if var.read_throughput_lasting_duration_major != null}, lasting='${var.read_throughput_lasting_duration_major}', at_least=${var.read_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.read_throughput_threshold_critical}%{if var.read_throughput_lasting_duration_critical != null}, lasting='${var.read_throughput_lasting_duration_critical}', at_least=${var.read_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.read_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.read_throughput_disabled_minor, var.read_throughput_disabled, var.detectors_disabled) - notifications = 
try(coalescelist(lookup(var.read_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.read_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.read_throughput_disabled_critical, var.read_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.read_throughput_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.read_throughput_runbook_url, var.runbook_url), "") tip = var.read_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -125,11 +131,11 @@ EOF } rule { - description = "is too high > ${var.read_throughput_threshold_warning}%" - severity = "Warning" - detect_label = "WARN" - disabled = coalesce(var.read_throughput_disabled_warning, var.read_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.read_throughput_notifications, "warning", []), var.notifications.warning), null) + description = "is too high > ${var.read_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.read_throughput_disabled_major, var.read_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.read_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.read_throughput_runbook_url, var.runbook_url), "") tip = var.read_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -140,6 +146,8 @@ EOF } resource "signalfx_detector" "write_throughput" { + count = (var.write_throughput_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of write throughput") authorized_writer_teams = var.authorized_writer_teams @@ -156,16 +164,16 @@ resource "signalfx_detector" "write_throughput" { write = data('DataWriteIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.write_throughput_aggregation_function}${var.write_throughput_transformation_function} total = data('TotalIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.write_throughput_aggregation_function}${var.write_throughput_transformation_function} signal = (write/total).scale(100).publish('signal') - detect(when(signal > ${var.write_throughput_threshold_minor}%{if var.write_throughput_lasting_duration_minor != null}, lasting='${var.write_throughput_lasting_duration_minor}', at_least=${var.write_throughput_at_least_percentage_minor}%{endif})).publish('MINOR') - detect(when(signal > ${var.write_throughput_threshold_warning}%{if var.write_throughput_lasting_duration_warning != null}, lasting='${var.write_throughput_lasting_duration_warning}', at_least=${var.write_throughput_at_least_percentage_warning}%{endif}) and (not when(signal > ${var.write_throughput_threshold_minor}%{if var.write_throughput_lasting_duration_minor != null}, lasting='${var.write_throughput_lasting_duration_minor}', at_least=${var.write_throughput_at_least_percentage_minor}%{endif}))).publish('WARN') + detect(when(signal > ${var.write_throughput_threshold_critical}%{if var.write_throughput_lasting_duration_critical != null}, lasting='${var.write_throughput_lasting_duration_critical}', at_least=${var.write_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > 
${var.write_throughput_threshold_major}%{if var.write_throughput_lasting_duration_major != null}, lasting='${var.write_throughput_lasting_duration_major}', at_least=${var.write_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.write_throughput_threshold_critical}%{if var.write_throughput_lasting_duration_critical != null}, lasting='${var.write_throughput_lasting_duration_critical}', at_least=${var.write_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.write_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.write_throughput_disabled_minor, var.write_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.write_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.write_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.write_throughput_disabled_critical, var.write_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.write_throughput_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.write_throughput_runbook_url, var.runbook_url), "") tip = var.write_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -173,11 +181,11 @@ EOF } rule { - description = "is too high > ${var.write_throughput_threshold_warning}%" - severity = "Warning" - detect_label = "WARN" - disabled = coalesce(var.write_throughput_disabled_warning, var.write_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.write_throughput_notifications, "warning", []), var.notifications.warning), null) + description = "is too high > ${var.write_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.write_throughput_disabled_major, var.write_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.write_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.write_throughput_runbook_url, var.runbook_url), "") tip = var.write_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -188,6 +196,8 @@ EOF } resource "signalfx_detector" "percent_of_permitted_throughput" { + count = (var.permitted_throughput_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of permitted throughput") authorized_writer_teams = var.authorized_writer_teams @@ -204,16 +214,16 @@ resource "signalfx_detector" "percent_of_permitted_throughput" { metered = data('MeteredIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.percent_of_permitted_throughput_aggregation_function}${var.percent_of_permitted_throughput_transformation_function} permitted = data('PermittedThroughput', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.percent_of_permitted_throughput_aggregation_function}${var.percent_of_permitted_throughput_transformation_function} signal = (metered/permitted.scale(60)).scale(100).publish('signal') - detect(when(signal > ${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif})).publish('MAJOR') - detect(when(signal > ${var.percent_of_permitted_throughput_threshold_minor}%{if var.percent_of_permitted_throughput_lasting_duration_minor != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_minor}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_minor}%{endif}) and (not when(signal > ${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal > ${var.percent_of_permitted_throughput_threshold_critical}%{if var.percent_of_permitted_throughput_lasting_duration_critical != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_critical}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.percent_of_permitted_throughput_threshold_critical}%{if var.percent_of_permitted_throughput_lasting_duration_critical != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_critical}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.percent_of_permitted_throughput_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.percent_of_permitted_throughput_disabled_major, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.percent_of_permitted_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.percent_of_permitted_throughput_disabled_critical, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "critical", 
[]), var.notifications.critical), null) runbook_url = try(coalesce(var.percent_of_permitted_throughput_runbook_url, var.runbook_url), "") tip = var.percent_of_permitted_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -221,11 +231,11 @@ EOF } rule { - description = "is too high > ${var.percent_of_permitted_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.percent_of_permitted_throughput_disabled_minor, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.percent_of_permitted_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.percent_of_permitted_throughput_disabled_major, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.percent_of_permitted_throughput_runbook_url, var.runbook_url), "") tip = var.percent_of_permitted_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -236,6 +246,8 @@ EOF } resource "signalfx_detector" "burst_credit_balance" { + count = (var.burst_credit_balance_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS burst credit balance") authorized_writer_teams = var.authorized_writer_teams @@ -250,15 +262,15 @@ resource "signalfx_detector" "burst_credit_balance" { program_text = <<-EOF base_filtering = filter('namespace', 'AWS/EFS') signal = data('BurstCreditBalance', filter=base_filtering and filter('stat', 'lower') and ${module.filtering.signalflow})${var.burst_credit_balance_aggregation_function}${var.burst_credit_balance_transformation_function}.publish('signal') - detect(when(signal < ${var.burst_credit_balance_threshold_major}%{if var.burst_credit_balance_lasting_duration_major != null}, lasting='${var.burst_credit_balance_lasting_duration_major}', at_least=${var.burst_credit_balance_at_least_percentage_major}%{endif})).publish('MAJOR') + detect(when(signal < ${var.burst_credit_balance_threshold_critical}%{if var.burst_credit_balance_lasting_duration_critical != null}, lasting='${var.burst_credit_balance_lasting_duration_critical}', at_least=${var.burst_credit_balance_at_least_percentage_critical}%{endif})).publish('CRIT') EOF rule { - description = "is too low < ${var.burst_credit_balance_threshold_major}credits" - severity = "Major" - detect_label = "MAJOR" + description = "is too low < ${var.burst_credit_balance_threshold_critical}credits" + severity = "Critical" + detect_label = "CRIT" disabled = coalesce(var.burst_credit_balance_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.burst_credit_balance_notifications, "major", []), var.notifications.major), null) + notifications = try(coalescelist(lookup(var.burst_credit_balance_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.burst_credit_balance_runbook_url, var.runbook_url), "") tip = var.burst_credit_balance_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject diff --git a/modules/integration_aws-efs/variables-gen.tf b/modules/integration_aws-efs/variables-gen.tf index f52d0a029..4cf22ccfb 100644 --- a/modules/integration_aws-efs/variables-gen.tf +++ b/modules/integration_aws-efs/variables-gen.tf @@ -15,7 +15,7 @@ variable "used_space_aggregation_function" { variable "used_space_transformation_function" { description = "Transformation function for used_space detector (i.e. \".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "used_space_max_delay" { @@ -62,7 +62,7 @@ variable "used_space_threshold_critical" { variable "used_space_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "used_space_at_least_percentage_critical" { @@ -78,7 +78,7 @@ variable "used_space_threshold_major" { variable "used_space_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "used_space_at_least_percentage_major" { @@ -103,7 +103,7 @@ variable "io_limit_aggregation_function" { variable "io_limit_transformation_function" { description = "Transformation function for io_limit detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='30m')" + default = "" } variable "io_limit_max_delay" { @@ -132,48 +132,48 @@ variable "io_limit_disabled" { default = null } -variable "io_limit_disabled_major" { - description = "Disable major alerting rule for io_limit detector" +variable "io_limit_disabled_critical" { + description = "Disable critical alerting rule for io_limit detector" type = bool default = null } -variable "io_limit_disabled_minor" { - description = "Disable minor alerting rule for io_limit detector" +variable "io_limit_disabled_major" { + description = "Disable major alerting rule for io_limit detector" type = bool default = null } -variable "io_limit_threshold_major" { - description = "Major threshold for io_limit detector in %" +variable "io_limit_threshold_critical" { + description = "Critical threshold for io_limit detector in %" type = number default = 90 } -variable "io_limit_lasting_duration_major" { +variable "io_limit_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } -variable "io_limit_at_least_percentage_major" { +variable "io_limit_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "io_limit_threshold_minor" { - description = "Minor threshold for io_limit detector in %" +variable "io_limit_threshold_major" { + description = "Major threshold for io_limit detector in %" type = number default = 80 } -variable "io_limit_lasting_duration_minor" { +variable "io_limit_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } -variable "io_limit_at_least_percentage_minor" { +variable "io_limit_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -195,7 +195,7 @@ variable "read_throughput_aggregation_function" { variable "read_throughput_transformation_function" { description = 
"Transformation function for read_throughput detector (i.e. \".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "read_throughput_max_delay" { @@ -219,49 +219,51 @@ variable "read_throughput_runbook_url" { variable "read_throughput_disabled" { description = "Disable all alerting rules for read_throughput detector" type = bool - default = true + default = null } -variable "read_throughput_disabled_minor" { - description = "Disable minor alerting rule for read_throughput detector" +variable "read_throughput_disabled_critical" { + description = "Disable critical alerting rule for read_throughput detector" type = bool default = null } -variable "read_throughput_disabled_warning" { - description = "Disable warning alerting rule for read_throughput detector" +variable "read_throughput_disabled_major" { + description = "Disable major alerting rule for read_throughput detector" type = bool default = null } -variable "read_throughput_threshold_minor" { - description = "Minor threshold for read_throughput detector in %" +variable "read_throughput_threshold_critical" { + description = "Critical threshold for read_throughput detector in %" type = number + default = 90 } -variable "read_throughput_lasting_duration_minor" { +variable "read_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "read_throughput_at_least_percentage_minor" { +variable "read_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "read_throughput_threshold_warning" { - description = "Warning threshold for read_throughput detector in %" +variable "read_throughput_threshold_major" { + description = "Major threshold for read_throughput detector in %" type = number + default = 80 } -variable "read_throughput_lasting_duration_warning" { +variable "read_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "read_throughput_at_least_percentage_warning" { +variable "read_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -283,7 +285,7 @@ variable "write_throughput_aggregation_function" { variable "write_throughput_transformation_function" { description = "Transformation function for write_throughput detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "write_throughput_max_delay" { @@ -307,49 +309,51 @@ variable "write_throughput_runbook_url" { variable "write_throughput_disabled" { description = "Disable all alerting rules for write_throughput detector" type = bool - default = true + default = null } -variable "write_throughput_disabled_minor" { - description = "Disable minor alerting rule for write_throughput detector" +variable "write_throughput_disabled_critical" { + description = "Disable critical alerting rule for write_throughput detector" type = bool default = null } -variable "write_throughput_disabled_warning" { - description = "Disable warning alerting rule for write_throughput detector" +variable "write_throughput_disabled_major" { + description = "Disable major alerting rule for write_throughput detector" type = bool default = null } -variable "write_throughput_threshold_minor" { - description = "Minor threshold for write_throughput detector in %" +variable "write_throughput_threshold_critical" { + description = "Critical threshold for write_throughput detector in %" type = number + default = 90 } -variable "write_throughput_lasting_duration_minor" { +variable "write_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "write_throughput_at_least_percentage_minor" { +variable "write_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "write_throughput_threshold_warning" { - description = "Warning threshold for write_throughput detector in %" +variable "write_throughput_threshold_major" { + description = "Major threshold for write_throughput detector in %" type = number + default = 80 } -variable "write_throughput_lasting_duration_warning" { +variable "write_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "write_throughput_at_least_percentage_warning" { +variable "write_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -371,7 +375,7 @@ variable "percent_of_permitted_throughput_aggregation_function" { variable "percent_of_permitted_throughput_transformation_function" { description = "Transformation function for percent_of_permitted_throughput detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='30m')" + default = "" } variable "percent_of_permitted_throughput_max_delay" { @@ -400,48 +404,48 @@ variable "percent_of_permitted_throughput_disabled" { default = null } -variable "percent_of_permitted_throughput_disabled_major" { - description = "Disable major alerting rule for percent_of_permitted_throughput detector" +variable "percent_of_permitted_throughput_disabled_critical" { + description = "Disable critical alerting rule for percent_of_permitted_throughput detector" type = bool default = null } -variable "percent_of_permitted_throughput_disabled_minor" { - description = "Disable minor alerting rule for percent_of_permitted_throughput detector" +variable "percent_of_permitted_throughput_disabled_major" { + description = "Disable major alerting rule for percent_of_permitted_throughput detector" type = bool default = null } -variable "percent_of_permitted_throughput_threshold_major" { - description = "Major threshold for percent_of_permitted_throughput detector in %" +variable "percent_of_permitted_throughput_threshold_critical" { + description = "Critical threshold for percent_of_permitted_throughput detector in %" type = number default = 90 } -variable "percent_of_permitted_throughput_lasting_duration_major" { +variable "percent_of_permitted_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } -variable "percent_of_permitted_throughput_at_least_percentage_major" { +variable "percent_of_permitted_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "percent_of_permitted_throughput_threshold_minor" { - description = "Minor threshold for percent_of_permitted_throughput detector in %" +variable "percent_of_permitted_throughput_threshold_major" { + description = "Major threshold for percent_of_permitted_throughput detector in %" type = number default = 80 } -variable "percent_of_permitted_throughput_lasting_duration_minor" { +variable "percent_of_permitted_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } -variable "percent_of_permitted_throughput_at_least_percentage_minor" { +variable "percent_of_permitted_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -463,7 +467,7 @@ variable "burst_credit_balance_aggregation_function" { variable "burst_credit_balance_transformation_function" { description = "Transformation function for burst_credit_balance detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" } variable "burst_credit_balance_max_delay" { @@ -492,19 +496,19 @@ variable "burst_credit_balance_disabled" { default = null } -variable "burst_credit_balance_threshold_major" { - description = "Major threshold for burst_credit_balance detector in credits" +variable "burst_credit_balance_threshold_critical" { + description = "Critical threshold for burst_credit_balance detector in credits" type = number default = 1 } -variable "burst_credit_balance_lasting_duration_major" { +variable "burst_credit_balance_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } -variable "burst_credit_balance_at_least_percentage_major" { +variable "burst_credit_balance_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 diff --git a/modules/integration_aws-efs/variables.tf b/modules/integration_aws-efs/variables.tf index 4fbfe41c0..f3d2bded6 100644 --- a/modules/integration_aws-efs/variables.tf +++ b/modules/integration_aws-efs/variables.tf @@ -1,3 +1,39 @@ +variable "used_space_detector_enabled" { + description = "Enable used space detector" + type = bool + default = true +} + +variable "io_limit_detector_enabled" { + description = "Enable io limit detector" + type = bool + default = true +} + +variable "read_throughput_detector_enabled" { + description = "Enable read throughput detector" + type = bool + default = true +} + +variable "write_throughput_detector_enabled" { + description = "Enable write throughput detector" + type = bool + default = true +} + +variable "permitted_throughput_detector_enabled" { + description = "Enable permitted throughput detector" + type = bool + default = true +} + +variable "burst_credit_balance_detector_enabled" { + description = "Enable burst credit balance detector" + type = bool + default = true +} + # Module specific variable "efs_id" { diff --git a/modules/integration_aws-nlb/README.md b/modules/integration_aws-nlb/README.md index 1f609e5b3..783174628 100644 --- a/modules/integration_aws-nlb/README.md +++ b/modules/integration_aws-nlb/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables-gen.tf](variables-gen.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. 
diff --git a/modules/integration_aws-nlb/conf/00-heartbeat.yaml b/modules/integration_aws-nlb/conf/00-heartbeat.yaml index 6410e216e..191a0eec9 100644 --- a/modules/integration_aws-nlb/conf/00-heartbeat.yaml +++ b/modules/integration_aws-nlb/conf/00-heartbeat.yaml @@ -4,9 +4,10 @@ name: "heartbeat" transformation: true aggregation: ".mean(by=['LoadBalancer'])" filtering: "filter('stat', 'mean') and filter('namespace', 'AWS/NetworkELB')" +condition: "var.heartbeat_detector_enabled" signals: signal: metric: "ConsumedLCUs" rules: - critical: \ No newline at end of file + critical: diff --git a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml index 6d4391355..085ae2599 100644 --- a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml +++ b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml @@ -2,9 +2,9 @@ module: "AWS NLB" name: "Healthy instances percentage" id: "no_healthy_instances" -transformation: ".min(over='5m')" aggregation: true filtering: "filter('namespace', 'AWS/NetworkELB') and (not filter('AvailabilityZone', '*'))" +condition: "var.healthy_instances_detector_enabled" signals: A: @@ -20,8 +20,10 @@ rules: critical: threshold: 1 comparator: "<" - + lasting_duration: "5min" + major: threshold: 100 comparator: "<" - dependency: "critical" \ No newline at end of file + dependency: "critical" + lasting_duration: "5min" diff --git a/modules/integration_aws-nlb/detectors-gen.tf b/modules/integration_aws-nlb/detectors-gen.tf index d53fc1cf6..1936b26ec 100644 --- a/modules/integration_aws-nlb/detectors-gen.tf +++ b/modules/integration_aws-nlb/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS NLB heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "no_healthy_instances" { + count = (var.healthy_instances_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS NLB healthy instances percentage") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-nlb/variables-gen.tf b/modules/integration_aws-nlb/variables-gen.tf index da6880eec..1555c87dc 100644 --- a/modules/integration_aws-nlb/variables-gen.tf +++ b/modules/integration_aws-nlb/variables-gen.tf @@ -65,7 +65,7 @@ variable "no_healthy_instances_aggregation_function" { variable "no_healthy_instances_transformation_function" { description = "Transformation function for no_healthy_instances detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='5m')" + default = "" } variable "no_healthy_instances_max_delay" { @@ -113,7 +113,7 @@ variable "no_healthy_instances_threshold_critical" { variable "no_healthy_instances_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5min" } variable "no_healthy_instances_at_least_percentage_critical" { @@ -130,7 +130,7 @@ variable "no_healthy_instances_threshold_major" { variable "no_healthy_instances_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5min" } variable "no_healthy_instances_at_least_percentage_major" { diff --git a/modules/integration_aws-nlb/variables.tf b/modules/integration_aws-nlb/variables.tf new file mode 100644 index 000000000..7161f5ba1 --- /dev/null +++ b/modules/integration_aws-nlb/variables.tf @@ -0,0 +1,11 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "healthy_instances_detector_enabled" { + description = "Enable healthy instances detector" + type = bool + default = true +} diff --git a/modules/integration_aws-rds-common/README.md b/modules/integration_aws-rds-common/README.md index 548798557..1becc32b4 100644 --- a/modules/integration_aws-rds-common/README.md +++ b/modules/integration_aws-rds-common/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables-gen.tf](variables-gen.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. 
diff --git a/modules/integration_aws-rds-common/conf/00-heartbeat.yaml b/modules/integration_aws-rds-common/conf/00-heartbeat.yaml index e886f3d63..b8f8dcd2c 100644 --- a/modules/integration_aws-rds-common/conf/00-heartbeat.yaml +++ b/modules/integration_aws-rds-common/conf/00-heartbeat.yaml @@ -4,6 +4,7 @@ name: heartbeat transformation: false aggregation: ".mean(by=['DBInstanceIdentifier'])" filtering: "filter('namespace', 'AWS/RDS')" +condition: "var.heartbeat_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/01-cpu.yaml b/modules/integration_aws-rds-common/conf/01-cpu.yaml index 39a61ec3a..b66f1a718 100644 --- a/modules/integration_aws-rds-common/conf/01-cpu.yaml +++ b/modules/integration_aws-rds-common/conf/01-cpu.yaml @@ -2,11 +2,9 @@ module: AWS RDS Common id: "cpu_usage" name: "Instance CPU" -transformation: true -aggregation: ".min(over='15m')" - filtering: "filter('namespace', 'AWS/RDS')" value_unit: "%" +condition: "var.cpu_usage_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/02-free-space.yaml b/modules/integration_aws-rds-common/conf/02-free-space.yaml index c2ba75164..22fb08c9e 100644 --- a/modules/integration_aws-rds-common/conf/02-free-space.yaml +++ b/modules/integration_aws-rds-common/conf/02-free-space.yaml @@ -3,10 +3,9 @@ id: "free_space_low" name: "Instance free space" transformation: ".scale(1/1024**3)" -aggregation: ".min(over='15m')" - filtering: "filter('namespace', 'AWS/RDS')" value_unit: "Gibibyte" +condition: "var.free_space_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml index ad1c163a8..3e32181ae 100644 --- a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml +++ b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml @@ -1,10 +1,8 @@ module: AWS RDS Common name: "Replica lag" -transformation: true -aggregation: ".min(over='5m')" - filtering: "filter('namespace', 'AWS/RDS')" +condition: "var.replica_lag_detector_enabled" signals: signal: @@ -15,7 +13,9 @@ rules: critical: threshold: 300 comparator: ">" + duration: "5m" major: threshold: 200 comparator: ">" + duration: "5m" dependency: critical diff --git a/modules/integration_aws-rds-common/conf/04-dbload.yaml b/modules/integration_aws-rds-common/conf/04-dbload.yaml index 5fb89adf9..e9446d192 100644 --- a/modules/integration_aws-rds-common/conf/04-dbload.yaml +++ b/modules/integration_aws-rds-common/conf/04-dbload.yaml @@ -2,9 +2,8 @@ module: AWS RDS Common id: dbload name: "DB Load" -transformation: true -aggregation: true filtering: "filter('namespace', 'AWS/RDS') and filter('stat', 'mean')" +condition: "var.dbload_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/detectors-gen.tf b/modules/integration_aws-rds-common/detectors-gen.tf index cccaab132..1e462c569 100644 --- a/modules/integration_aws-rds-common/detectors-gen.tf +++ b/modules/integration_aws-rds-common/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "cpu_usage" { + count = (var.cpu_usage_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common instance cpu") authorized_writer_teams = var.authorized_writer_teams @@ -74,6 +78,8 @@ EOF } resource "signalfx_detector" "free_space_low" { + count = (var.free_space_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common instance free space") authorized_writer_teams = var.authorized_writer_teams @@ -120,6 +126,8 @@ EOF } resource "signalfx_detector" "replica_lag" { + count = (var.replica_lag_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common replica lag") authorized_writer_teams = var.authorized_writer_teams @@ -161,6 +169,8 @@ EOF } resource "signalfx_detector" "dbload" { + count = (var.dbload_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common db load") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-rds-common/variables-gen.tf b/modules/integration_aws-rds-common/variables-gen.tf index 585500e5f..3ebcaf28b 100644 --- a/modules/integration_aws-rds-common/variables-gen.tf +++ b/modules/integration_aws-rds-common/variables-gen.tf @@ -53,7 +53,7 @@ variable "cpu_usage_notifications" { variable "cpu_usage_aggregation_function" { description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "cpu_usage_transformation_function" { @@ -143,7 +143,7 @@ variable "free_space_low_notifications" { variable "free_space_low_aggregation_function" { description = "Aggregation function and group by for free_space_low detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "free_space_low_transformation_function" { @@ -233,7 +233,7 @@ variable "replica_lag_notifications" { variable "replica_lag_aggregation_function" { description = "Aggregation function and group by for replica_lag detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='5m')" + default = "" } variable "replica_lag_transformation_function" { diff --git a/modules/integration_aws-rds-common/variables.tf b/modules/integration_aws-rds-common/variables.tf new file mode 100644 index 000000000..f64f54bda --- /dev/null +++ b/modules/integration_aws-rds-common/variables.tf @@ -0,0 +1,29 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "cpu_usage_detector_enabled" { + description = "Enable cpu usage detector" + type = bool + default = true +} + +variable "free_space_detector_enabled" { + description = "Enable free space detector" + type = bool + default = true +} + +variable "replica_lag_detector_enabled" { + description = "Enable replica lag detector" + type = bool + default = true +} + +variable "dbload_detector_enabled" { + description = "Enable dbload detector" + type = bool + default = true +}
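Closing sketch tying the EFS changes together: each detector is now gated by a `count = var.<id>_detector_enabled ? 1 : 0` expression, the io_limit, read/write throughput and permitted-throughput rules were raised from Major/Minor (or Minor/Warning) to Critical/Major, burst credit balance from Major to Critical, and the old transformation-function defaults were replaced by lasting durations. The example below is illustrative only; the module label and source come from the EFS README earlier in the diff, and the threshold values are arbitrary overrides of the new 90/80 defaults.

```hcl
module "signalfx-detectors-integration-aws-efs" {
  source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-efs?ref={revision}"

  environment   = var.environment
  notifications = local.notifications

  # Per-detector toggles added by this change (all default to true).
  read_throughput_detector_enabled  = false
  write_throughput_detector_enabled = false

  # io_limit thresholds now map to Critical/Major rules (defaults 90 and 80).
  io_limit_threshold_critical = 95
  io_limit_threshold_major    = 85
}
```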