From 5ee329676ef6ba180ba34f3cca4d4075595e275a Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 09:34:21 +0100 Subject: [PATCH 01/11] fix: integration_aws-efs: replace transformation with lasting property --- .../conf/01-used-space.yaml | 3 +- .../integration_aws-efs/conf/02-io-limit.yaml | 3 +- .../conf/03-throughput-read.yaml | 3 +- .../conf/04-throughput-write.yaml | 3 +- .../conf/05-permitted-throughput.yaml | 3 +- .../conf/06-burst-cedit-balance.yaml | 2 +- modules/integration_aws-efs/variables-gen.tf | 34 +++++++++---------- 7 files changed, 28 insertions(+), 23 deletions(-) diff --git a/modules/integration_aws-efs/conf/01-used-space.yaml b/modules/integration_aws-efs/conf/01-used-space.yaml index b1453ff46..f7f499038 100644 --- a/modules/integration_aws-efs/conf/01-used-space.yaml +++ b/modules/integration_aws-efs/conf/01-used-space.yaml @@ -2,7 +2,6 @@ module: "AWS EFS" name: "Used Space" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "Gibibyte" -transformation: ".max(over='15m')" signals: used_space: metric: "StorageBytes" @@ -13,6 +12,8 @@ signals: rules: critical: comparator: ">" + lasting_duration: "15m" major: comparator: ">" + lasting_duration: "15m" dependency: critical diff --git a/modules/integration_aws-efs/conf/02-io-limit.yaml b/modules/integration_aws-efs/conf/02-io-limit.yaml index dff96360a..59965b39c 100644 --- a/modules/integration_aws-efs/conf/02-io-limit.yaml +++ b/modules/integration_aws-efs/conf/02-io-limit.yaml @@ -3,7 +3,6 @@ name: "Percent of IO Limit" id: "io_limit" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".mean(over='30m')" tip: "If you reach too often the limit with current General Purpose mode, consider moving your application to a file system using the Max I/O performance mode." 
signals: signal: @@ -13,7 +12,9 @@ rules: major: comparator: ">" threshold: 90 + lasting_duration: "30m" minor: comparator: ">" dependency: major threshold: 80 + lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index 84e109933..354a9a40d 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -3,7 +3,6 @@ name: "Percent of read throughput" id: "read_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".max(over='15m')" disabled: true signals: read: @@ -18,6 +17,8 @@ signals: rules: minor: comparator: ">" + lasting_duration: "15m" warning: + lasting_duration: "15m" comparator: ">" dependency: minor diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml b/modules/integration_aws-efs/conf/04-throughput-write.yaml index acff87008..bfd8717fa 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -3,7 +3,6 @@ name: "Percent of write throughput" id: "write_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".max(over='15m')" disabled: true signals: write: @@ -18,6 +17,8 @@ signals: rules: minor: comparator: ">" + lasting_duration: "15m" warning: comparator: ">" dependency: minor + lasting_duration: "15m" diff --git a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml index 14701fc41..cfaf1d9ee 100644 --- a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml +++ b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml @@ -2,7 +2,6 @@ module: "AWS EFS" name: "Percent of permitted throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" -transformation: ".mean(over='30m')" tip: "You are consuming the entire amount of 
throughput allocated to your file system, In this situation, you might consider changing the file system's throughput mode to Provisioned Throughput to get higher throughput." signals: metered: @@ -17,7 +16,9 @@ rules: major: comparator: ">" threshold: 90 + lasting_duration: "30m" minor: comparator: ">" dependency: major threshold: 80 + lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml index bf8963cc2..ccaa5be95 100644 --- a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml +++ b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml @@ -2,7 +2,6 @@ module: "AWS EFS" name: "Burst Credit Balance" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "credits" -transformation: ".mean(over='5m')" tip: "See https://docs.aws.amazon.com/efs/latest/ug/performance.html#bursting" signals: signal: @@ -12,3 +11,4 @@ rules: major: comparator: "<" threshold: 1 + lasting_duration: "5m" diff --git a/modules/integration_aws-efs/variables-gen.tf b/modules/integration_aws-efs/variables-gen.tf index f52d0a029..f6e164116 100644 --- a/modules/integration_aws-efs/variables-gen.tf +++ b/modules/integration_aws-efs/variables-gen.tf @@ -15,7 +15,7 @@ variable "used_space_aggregation_function" { variable "used_space_transformation_function" { description = "Transformation function for used_space detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "used_space_max_delay" { @@ -62,7 +62,7 @@ variable "used_space_threshold_critical" { variable "used_space_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "used_space_at_least_percentage_critical" { @@ -78,7 +78,7 @@ variable "used_space_threshold_major" { variable "used_space_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "used_space_at_least_percentage_major" { @@ -103,7 +103,7 @@ variable "io_limit_aggregation_function" { variable "io_limit_transformation_function" { description = "Transformation function for io_limit detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='30m')" + default = "" } variable "io_limit_max_delay" { @@ -153,7 +153,7 @@ variable "io_limit_threshold_major" { variable "io_limit_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "io_limit_at_least_percentage_major" { @@ -170,7 +170,7 @@ variable "io_limit_threshold_minor" { variable "io_limit_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "io_limit_at_least_percentage_minor" { @@ -195,7 +195,7 @@ variable "read_throughput_aggregation_function" { variable "read_throughput_transformation_function" { description = "Transformation function for read_throughput detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "read_throughput_max_delay" { @@ -242,7 +242,7 @@ variable "read_throughput_threshold_minor" { variable "read_throughput_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "read_throughput_at_least_percentage_minor" { @@ -258,7 +258,7 @@ variable "read_throughput_threshold_warning" { variable "read_throughput_lasting_duration_warning" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "read_throughput_at_least_percentage_warning" { @@ -283,7 +283,7 @@ variable "write_throughput_aggregation_function" { variable "write_throughput_transformation_function" { description = "Transformation function for write_throughput detector (i.e. \".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "write_throughput_max_delay" { @@ -330,7 +330,7 @@ variable "write_throughput_threshold_minor" { variable "write_throughput_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "write_throughput_at_least_percentage_minor" { @@ -346,7 +346,7 @@ variable "write_throughput_threshold_warning" { variable "write_throughput_lasting_duration_warning" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "write_throughput_at_least_percentage_warning" { @@ -371,7 +371,7 @@ variable "percent_of_permitted_throughput_aggregation_function" { variable "percent_of_permitted_throughput_transformation_function" { description = "Transformation function for percent_of_permitted_throughput detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='30m')" + default = "" } variable "percent_of_permitted_throughput_max_delay" { @@ -421,7 +421,7 @@ variable "percent_of_permitted_throughput_threshold_major" { variable "percent_of_permitted_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "percent_of_permitted_throughput_at_least_percentage_major" { @@ -438,7 +438,7 @@ variable "percent_of_permitted_throughput_threshold_minor" { variable "percent_of_permitted_throughput_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "percent_of_permitted_throughput_at_least_percentage_minor" { @@ -463,7 +463,7 @@ variable "burst_credit_balance_aggregation_function" { variable "burst_credit_balance_transformation_function" { description = "Transformation function for burst_credit_balance detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" } variable "burst_credit_balance_max_delay" { @@ -501,7 +501,7 @@ variable "burst_credit_balance_threshold_major" { variable "burst_credit_balance_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "burst_credit_balance_at_least_percentage_major" { From 9ba863b3246e5293c31022eeff94b624d215e996 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 10:15:28 +0100 Subject: [PATCH 02/11] feat: integration_aws-efs: optionalize detectors --- .../conf/01-used-space.yaml | 1 + .../integration_aws-efs/conf/02-io-limit.yaml | 1 + .../conf/03-throughput-read.yaml | 1 + .../conf/04-throughput-write.yaml | 1 + .../conf/05-permitted-throughput.yaml | 1 + .../conf/06-burst-cedit-balance.yaml | 1 + modules/integration_aws-efs/detectors-gen.tf | 12 +++++++ modules/integration_aws-efs/variables.tf | 36 +++++++++++++++++++ 8 files changed, 54 insertions(+) diff --git a/modules/integration_aws-efs/conf/01-used-space.yaml b/modules/integration_aws-efs/conf/01-used-space.yaml index f7f499038..4ea948ab3 100644 --- a/modules/integration_aws-efs/conf/01-used-space.yaml +++ b/modules/integration_aws-efs/conf/01-used-space.yaml @@ -2,6 +2,7 @@ module: "AWS EFS" name: "Used Space" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "Gibibyte" +condition: "var.used_space_detector_enabled" signals: used_space: metric: "StorageBytes" diff --git a/modules/integration_aws-efs/conf/02-io-limit.yaml b/modules/integration_aws-efs/conf/02-io-limit.yaml index 59965b39c..a74594d21 100644 --- a/modules/integration_aws-efs/conf/02-io-limit.yaml +++ b/modules/integration_aws-efs/conf/02-io-limit.yaml @@ -3,6 +3,7 @@ name: "Percent of IO Limit" id: "io_limit" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" +condition: "var.io_limit_detector_enabled" tip: "If you reach too often 
the limit with current General Purpose mode, consider moving your application to a file system using the Max I/O performance mode." signals: signal: diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index 354a9a40d..3bece99fe 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -3,6 +3,7 @@ name: "Percent of read throughput" id: "read_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" +condition: "var.read_throughput_detector_enabled" disabled: true signals: read: diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml b/modules/integration_aws-efs/conf/04-throughput-write.yaml index bfd8717fa..3447ec639 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -3,6 +3,7 @@ name: "Percent of write throughput" id: "write_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" +condition: "var.write_throughput_detector_enabled" disabled: true signals: write: diff --git a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml index cfaf1d9ee..d3e3010a1 100644 --- a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml +++ b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml @@ -2,6 +2,7 @@ module: "AWS EFS" name: "Percent of permitted throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" +condition: "var.permitted_throughput_detector_enabled" tip: "You are consuming the entire amount of throughput allocated to your file system, In this situation, you might consider changing the file system's throughput mode to Provisioned Throughput to get higher throughput." 
signals: metered: diff --git a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml index ccaa5be95..c070c66dd 100644 --- a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml +++ b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml @@ -2,6 +2,7 @@ module: "AWS EFS" name: "Burst Credit Balance" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "credits" +condition: "var.burst_credit_balance_detector_enabled" tip: "See https://docs.aws.amazon.com/efs/latest/ug/performance.html#bursting" signals: signal: diff --git a/modules/integration_aws-efs/detectors-gen.tf b/modules/integration_aws-efs/detectors-gen.tf index d93f0af88..4245f094d 100644 --- a/modules/integration_aws-efs/detectors-gen.tf +++ b/modules/integration_aws-efs/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "used_space" { + count = (var.used_space_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS used space") authorized_writer_teams = var.authorized_writer_teams @@ -46,6 +48,8 @@ EOF } resource "signalfx_detector" "io_limit" { + count = (var.io_limit_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of io limit") authorized_writer_teams = var.authorized_writer_teams @@ -92,6 +96,8 @@ EOF } resource "signalfx_detector" "read_throughput" { + count = (var.read_throughput_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of read throughput") authorized_writer_teams = var.authorized_writer_teams @@ -140,6 +146,8 @@ EOF } resource "signalfx_detector" "write_throughput" { + count = (var.write_throughput_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of write throughput") authorized_writer_teams = var.authorized_writer_teams @@ -188,6 +196,8 @@ EOF } resource "signalfx_detector" "percent_of_permitted_throughput" { + count = (var.permitted_throughput_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS percent of permitted throughput") authorized_writer_teams = var.authorized_writer_teams @@ -236,6 +246,8 @@ EOF } resource "signalfx_detector" "burst_credit_balance" { + count = (var.burst_credit_balance_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS EFS burst credit balance") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-efs/variables.tf b/modules/integration_aws-efs/variables.tf index 4fbfe41c0..f3d2bded6 100644 --- a/modules/integration_aws-efs/variables.tf +++ b/modules/integration_aws-efs/variables.tf @@ -1,3 +1,39 @@ +variable "used_space_detector_enabled" { + description = "Enable used space detector" + type = bool + default = true +} + +variable "io_limit_detector_enabled" { + description = "Enable io limit detector" + type = bool + default = true +} + +variable "read_throughput_detector_enabled" { + description = "Enable read throughput detector" + type = bool + default = true +} + +variable "write_throughput_detector_enabled" { + description = "Enable write throughput detector" + type = bool + default = true +} + +variable "permitted_throughput_detector_enabled" { + description = "Enable permitted throughput detector" + type = bool + default = true +} + +variable "burst_credit_balance_detector_enabled" { + description = "Enable burst credit balance detector" + type = bool + default = true +} + # Module specific variable "efs_id" { From 2f7806492836254c685121e27ea881fb421dfc66 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 16:17:07 +0100 Subject: [PATCH 03/11] fix: integration_aws-efs: move 
all alarms to critical/major channels --- docs/severity.md | 10 +- modules/integration_aws-efs/README.md | 10 +- .../integration_aws-efs/conf/02-io-limit.yaml | 6 +- .../conf/03-throughput-read.yaml | 6 +- .../conf/04-throughput-write.yaml | 6 +- .../conf/05-permitted-throughput.yaml | 6 +- .../conf/06-burst-cedit-balance.yaml | 2 +- modules/integration_aws-efs/detectors-gen.tf | 106 +++++++++--------- modules/integration_aws-efs/variables-gen.tf | 104 ++++++++--------- 9 files changed, 128 insertions(+), 128 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index 15dea78a5..97e439749 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -269,11 +269,11 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |AWS EFS used space|X|X|-|-|-| -|AWS EFS percent of io limit|-|X|X|-|-| -|AWS EFS percent of read throughput|-|-|X|X|-| -|AWS EFS percent of write throughput|-|-|X|X|-| -|AWS EFS percent of permitted throughput|-|X|X|-|-| -|AWS EFS burst credit balance|-|X|-|-|-| +|AWS EFS percent of io limit|X|X|-|-|-| +|AWS EFS percent of read throughput|X|X|-|-|-| +|AWS EFS percent of write throughput|X|X|-|-|-| +|AWS EFS percent of permitted throughput|X|X|-|-|-| +|AWS EFS burst credit balance|X|-|-|-|-| ## integration_aws-elasticache-common diff --git a/modules/integration_aws-efs/README.md b/modules/integration_aws-efs/README.md index 91dccd070..38e9d7f8c 100644 --- a/modules/integration_aws-efs/README.md +++ b/modules/integration_aws-efs/README.md @@ -87,11 +87,11 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |AWS EFS used space|X|X|-|-|-| -|AWS EFS percent of io limit|-|X|X|-|-| -|AWS EFS percent of read throughput|-|-|X|X|-| -|AWS EFS percent of write throughput|-|-|X|X|-| -|AWS EFS percent of permitted throughput|-|X|X|-|-| -|AWS EFS burst credit balance|-|X|-|-|-| +|AWS EFS percent of io limit|X|X|-|-|-| +|AWS EFS percent of read 
throughput|X|X|-|-|-| +|AWS EFS percent of write throughput|X|X|-|-|-| +|AWS EFS percent of permitted throughput|X|X|-|-|-| +|AWS EFS burst credit balance|X|-|-|-|-| ## How to collect required metrics? diff --git a/modules/integration_aws-efs/conf/02-io-limit.yaml b/modules/integration_aws-efs/conf/02-io-limit.yaml index a74594d21..01df7833f 100644 --- a/modules/integration_aws-efs/conf/02-io-limit.yaml +++ b/modules/integration_aws-efs/conf/02-io-limit.yaml @@ -10,12 +10,12 @@ signals: metric: "PercentIOLimit" filter: "filter('stat', 'mean')" rules: - major: + critical: comparator: ">" threshold: 90 lasting_duration: "30m" - minor: + major: comparator: ">" - dependency: major + dependency: critical threshold: 80 lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index 3bece99fe..8bfebc830 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -16,10 +16,10 @@ signals: formula: (read/total).scale(100) rules: - minor: + critical: comparator: ">" lasting_duration: "15m" - warning: + major: lasting_duration: "15m" comparator: ">" - dependency: minor + dependency: critical diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml b/modules/integration_aws-efs/conf/04-throughput-write.yaml index 3447ec639..ebb74aaad 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -16,10 +16,10 @@ signals: formula: (write/total).scale(100) rules: - minor: + critical: comparator: ">" lasting_duration: "15m" - warning: + major: comparator: ">" - dependency: minor + dependency: critical lasting_duration: "15m" diff --git a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml index d3e3010a1..f5da112bc 100644 --- 
a/modules/integration_aws-efs/conf/05-permitted-throughput.yaml +++ b/modules/integration_aws-efs/conf/05-permitted-throughput.yaml @@ -14,12 +14,12 @@ signals: signal: formula: (metered/permitted.scale(60)).scale(100) rules: - major: + critical: comparator: ">" threshold: 90 lasting_duration: "30m" - minor: + major: comparator: ">" - dependency: major + dependency: critical threshold: 80 lasting_duration: "30m" diff --git a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml index c070c66dd..59e748902 100644 --- a/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml +++ b/modules/integration_aws-efs/conf/06-burst-cedit-balance.yaml @@ -9,7 +9,7 @@ signals: metric: "BurstCreditBalance" filter: "filter('stat', 'lower')" rules: - major: + critical: comparator: "<" threshold: 1 lasting_duration: "5m" diff --git a/modules/integration_aws-efs/detectors-gen.tf b/modules/integration_aws-efs/detectors-gen.tf index 4245f094d..e83ba88d6 100644 --- a/modules/integration_aws-efs/detectors-gen.tf +++ b/modules/integration_aws-efs/detectors-gen.tf @@ -64,16 +64,16 @@ resource "signalfx_detector" "io_limit" { program_text = <<-EOF base_filtering = filter('namespace', 'AWS/EFS') signal = data('PercentIOLimit', filter=base_filtering and filter('stat', 'mean') and ${module.filtering.signalflow})${var.io_limit_aggregation_function}${var.io_limit_transformation_function}.publish('signal') - detect(when(signal > ${var.io_limit_threshold_major}%{if var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif})).publish('MAJOR') - detect(when(signal > ${var.io_limit_threshold_minor}%{if var.io_limit_lasting_duration_minor != null}, lasting='${var.io_limit_lasting_duration_minor}', at_least=${var.io_limit_at_least_percentage_minor}%{endif}) and (not when(signal > ${var.io_limit_threshold_major}%{if 
var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal > ${var.io_limit_threshold_critical}%{if var.io_limit_lasting_duration_critical != null}, lasting='${var.io_limit_lasting_duration_critical}', at_least=${var.io_limit_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.io_limit_threshold_major}%{if var.io_limit_lasting_duration_major != null}, lasting='${var.io_limit_lasting_duration_major}', at_least=${var.io_limit_at_least_percentage_major}%{endif}) and (not when(signal > ${var.io_limit_threshold_critical}%{if var.io_limit_lasting_duration_critical != null}, lasting='${var.io_limit_lasting_duration_critical}', at_least=${var.io_limit_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.io_limit_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.io_limit_disabled_major, var.io_limit_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.io_limit_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.io_limit_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.io_limit_disabled_critical, var.io_limit_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.io_limit_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.io_limit_runbook_url, var.runbook_url), "") tip = var.io_limit_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -81,11 +81,11 @@ EOF } rule { - description = "is too high > ${var.io_limit_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.io_limit_disabled_minor, var.io_limit_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.io_limit_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.io_limit_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.io_limit_disabled_major, var.io_limit_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.io_limit_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.io_limit_runbook_url, var.runbook_url), "") tip = var.io_limit_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -114,16 +114,16 @@ resource "signalfx_detector" "read_throughput" { read = data('DataReadIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.read_throughput_aggregation_function}${var.read_throughput_transformation_function} total = data('TotalIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.read_throughput_aggregation_function}${var.read_throughput_transformation_function} signal = (read/total).scale(100).publish('signal') - detect(when(signal > ${var.read_throughput_threshold_minor}%{if var.read_throughput_lasting_duration_minor != null}, lasting='${var.read_throughput_lasting_duration_minor}', at_least=${var.read_throughput_at_least_percentage_minor}%{endif})).publish('MINOR') - detect(when(signal > ${var.read_throughput_threshold_warning}%{if var.read_throughput_lasting_duration_warning != null}, lasting='${var.read_throughput_lasting_duration_warning}', at_least=${var.read_throughput_at_least_percentage_warning}%{endif}) and (not when(signal > 
${var.read_throughput_threshold_minor}%{if var.read_throughput_lasting_duration_minor != null}, lasting='${var.read_throughput_lasting_duration_minor}', at_least=${var.read_throughput_at_least_percentage_minor}%{endif}))).publish('WARN') + detect(when(signal > ${var.read_throughput_threshold_critical}%{if var.read_throughput_lasting_duration_critical != null}, lasting='${var.read_throughput_lasting_duration_critical}', at_least=${var.read_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.read_throughput_threshold_major}%{if var.read_throughput_lasting_duration_major != null}, lasting='${var.read_throughput_lasting_duration_major}', at_least=${var.read_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.read_throughput_threshold_critical}%{if var.read_throughput_lasting_duration_critical != null}, lasting='${var.read_throughput_lasting_duration_critical}', at_least=${var.read_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.read_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.read_throughput_disabled_minor, var.read_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.read_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.read_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.read_throughput_disabled_critical, var.read_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.read_throughput_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.read_throughput_runbook_url, var.runbook_url), "") tip = var.read_throughput_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -131,11 +131,11 @@ EOF } rule { - description = "is too high > ${var.read_throughput_threshold_warning}%" - severity = "Warning" - detect_label = "WARN" - disabled = coalesce(var.read_throughput_disabled_warning, var.read_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.read_throughput_notifications, "warning", []), var.notifications.warning), null) + description = "is too high > ${var.read_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.read_throughput_disabled_major, var.read_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.read_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.read_throughput_runbook_url, var.runbook_url), "") tip = var.read_throughput_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -164,16 +164,16 @@ resource "signalfx_detector" "write_throughput" { write = data('DataWriteIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.write_throughput_aggregation_function}${var.write_throughput_transformation_function} total = data('TotalIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.write_throughput_aggregation_function}${var.write_throughput_transformation_function} signal = (write/total).scale(100).publish('signal') - detect(when(signal > ${var.write_throughput_threshold_minor}%{if var.write_throughput_lasting_duration_minor != null}, lasting='${var.write_throughput_lasting_duration_minor}', at_least=${var.write_throughput_at_least_percentage_minor}%{endif})).publish('MINOR') - detect(when(signal > ${var.write_throughput_threshold_warning}%{if var.write_throughput_lasting_duration_warning != null}, lasting='${var.write_throughput_lasting_duration_warning}', 
at_least=${var.write_throughput_at_least_percentage_warning}%{endif}) and (not when(signal > ${var.write_throughput_threshold_minor}%{if var.write_throughput_lasting_duration_minor != null}, lasting='${var.write_throughput_lasting_duration_minor}', at_least=${var.write_throughput_at_least_percentage_minor}%{endif}))).publish('WARN') + detect(when(signal > ${var.write_throughput_threshold_critical}%{if var.write_throughput_lasting_duration_critical != null}, lasting='${var.write_throughput_lasting_duration_critical}', at_least=${var.write_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.write_throughput_threshold_major}%{if var.write_throughput_lasting_duration_major != null}, lasting='${var.write_throughput_lasting_duration_major}', at_least=${var.write_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.write_throughput_threshold_critical}%{if var.write_throughput_lasting_duration_critical != null}, lasting='${var.write_throughput_lasting_duration_critical}', at_least=${var.write_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.write_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.write_throughput_disabled_minor, var.write_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.write_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.write_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.write_throughput_disabled_critical, var.write_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.write_throughput_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.write_throughput_runbook_url, var.runbook_url), "") tip = var.write_throughput_tip 
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -181,11 +181,11 @@ EOF } rule { - description = "is too high > ${var.write_throughput_threshold_warning}%" - severity = "Warning" - detect_label = "WARN" - disabled = coalesce(var.write_throughput_disabled_warning, var.write_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.write_throughput_notifications, "warning", []), var.notifications.warning), null) + description = "is too high > ${var.write_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.write_throughput_disabled_major, var.write_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.write_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.write_throughput_runbook_url, var.runbook_url), "") tip = var.write_throughput_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -214,16 +214,16 @@ resource "signalfx_detector" "percent_of_permitted_throughput" { metered = data('MeteredIOBytes', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.percent_of_permitted_throughput_aggregation_function}${var.percent_of_permitted_throughput_transformation_function} permitted = data('PermittedThroughput', filter=base_filtering and filter('stat', 'sum') and ${module.filtering.signalflow})${var.percent_of_permitted_throughput_aggregation_function}${var.percent_of_permitted_throughput_transformation_function} signal = (metered/permitted.scale(60)).scale(100).publish('signal') - detect(when(signal > ${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif})).publish('MAJOR') - detect(when(signal > ${var.percent_of_permitted_throughput_threshold_minor}%{if var.percent_of_permitted_throughput_lasting_duration_minor != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_minor}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_minor}%{endif}) and (not when(signal > ${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal > ${var.percent_of_permitted_throughput_threshold_critical}%{if var.percent_of_permitted_throughput_lasting_duration_critical != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_critical}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > 
${var.percent_of_permitted_throughput_threshold_major}%{if var.percent_of_permitted_throughput_lasting_duration_major != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_major}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_major}%{endif}) and (not when(signal > ${var.percent_of_permitted_throughput_threshold_critical}%{if var.percent_of_permitted_throughput_lasting_duration_critical != null}, lasting='${var.percent_of_permitted_throughput_lasting_duration_critical}', at_least=${var.percent_of_permitted_throughput_at_least_percentage_critical}%{endif}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.percent_of_permitted_throughput_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.percent_of_permitted_throughput_disabled_major, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.percent_of_permitted_throughput_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.percent_of_permitted_throughput_disabled_critical, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.percent_of_permitted_throughput_runbook_url, var.runbook_url), "") tip = var.percent_of_permitted_throughput_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -231,11 +231,11 @@ EOF } rule { - description = "is too high > ${var.percent_of_permitted_throughput_threshold_minor}%" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.percent_of_permitted_throughput_disabled_minor, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "minor", []), var.notifications.minor), null) + description = "is too high > ${var.percent_of_permitted_throughput_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.percent_of_permitted_throughput_disabled_major, var.percent_of_permitted_throughput_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.percent_of_permitted_throughput_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.percent_of_permitted_throughput_runbook_url, var.runbook_url), "") tip = var.percent_of_permitted_throughput_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -262,15 +262,15 @@ resource "signalfx_detector" "burst_credit_balance" { program_text = <<-EOF base_filtering = filter('namespace', 'AWS/EFS') signal = data('BurstCreditBalance', filter=base_filtering and filter('stat', 'lower') and ${module.filtering.signalflow})${var.burst_credit_balance_aggregation_function}${var.burst_credit_balance_transformation_function}.publish('signal') - detect(when(signal < ${var.burst_credit_balance_threshold_major}%{if var.burst_credit_balance_lasting_duration_major != null}, lasting='${var.burst_credit_balance_lasting_duration_major}', at_least=${var.burst_credit_balance_at_least_percentage_major}%{endif})).publish('MAJOR') + detect(when(signal < ${var.burst_credit_balance_threshold_critical}%{if var.burst_credit_balance_lasting_duration_critical != null}, lasting='${var.burst_credit_balance_lasting_duration_critical}', at_least=${var.burst_credit_balance_at_least_percentage_critical}%{endif})).publish('CRIT') EOF rule { - description = "is too low < ${var.burst_credit_balance_threshold_major}credits" - severity = "Major" - detect_label = "MAJOR" + description = "is too low < ${var.burst_credit_balance_threshold_critical}credits" + severity = "Critical" + detect_label = "CRIT" disabled = coalesce(var.burst_credit_balance_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.burst_credit_balance_notifications, "major", []), var.notifications.major), null) + notifications = try(coalescelist(lookup(var.burst_credit_balance_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.burst_credit_balance_runbook_url, var.runbook_url), "") tip = var.burst_credit_balance_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject diff --git a/modules/integration_aws-efs/variables-gen.tf b/modules/integration_aws-efs/variables-gen.tf index f6e164116..3e569fbc1 100644 --- a/modules/integration_aws-efs/variables-gen.tf +++ b/modules/integration_aws-efs/variables-gen.tf @@ -132,48 +132,48 @@ variable "io_limit_disabled" { default = null } -variable "io_limit_disabled_major" { - description = "Disable major alerting rule for io_limit detector" +variable "io_limit_disabled_critical" { + description = "Disable critical alerting rule for io_limit detector" type = bool default = null } -variable "io_limit_disabled_minor" { - description = "Disable minor alerting rule for io_limit detector" +variable "io_limit_disabled_major" { + description = "Disable major alerting rule for io_limit detector" type = bool default = null } -variable "io_limit_threshold_major" { - description = "Major threshold for io_limit detector in %" +variable "io_limit_threshold_critical" { + description = "Critical threshold for io_limit detector in %" type = number default = 90 } -variable "io_limit_lasting_duration_major" { +variable "io_limit_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "30m" } -variable "io_limit_at_least_percentage_major" { +variable "io_limit_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "io_limit_threshold_minor" { - description = "Minor threshold for io_limit detector in %" +variable "io_limit_threshold_major" { + description = "Major threshold for io_limit detector in %" type = number default = 80 } -variable "io_limit_lasting_duration_minor" { +variable "io_limit_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "30m" } -variable 
"io_limit_at_least_percentage_minor" { +variable "io_limit_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -222,46 +222,46 @@ variable "read_throughput_disabled" { default = true } -variable "read_throughput_disabled_minor" { - description = "Disable minor alerting rule for read_throughput detector" +variable "read_throughput_disabled_critical" { + description = "Disable critical alerting rule for read_throughput detector" type = bool default = null } -variable "read_throughput_disabled_warning" { - description = "Disable warning alerting rule for read_throughput detector" +variable "read_throughput_disabled_major" { + description = "Disable major alerting rule for read_throughput detector" type = bool default = null } -variable "read_throughput_threshold_minor" { - description = "Minor threshold for read_throughput detector in %" +variable "read_throughput_threshold_critical" { + description = "Critical threshold for read_throughput detector in %" type = number } -variable "read_throughput_lasting_duration_minor" { +variable "read_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "15m" } -variable "read_throughput_at_least_percentage_minor" { +variable "read_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "read_throughput_threshold_warning" { - description = "Warning threshold for read_throughput detector in %" +variable "read_throughput_threshold_major" { + description = "Major threshold for read_throughput detector in %" type = number } -variable "read_throughput_lasting_duration_warning" { +variable "read_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before 
raising alert" type = string default = "15m" } -variable "read_throughput_at_least_percentage_warning" { +variable "read_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -310,46 +310,46 @@ variable "write_throughput_disabled" { default = true } -variable "write_throughput_disabled_minor" { - description = "Disable minor alerting rule for write_throughput detector" +variable "write_throughput_disabled_critical" { + description = "Disable critical alerting rule for write_throughput detector" type = bool default = null } -variable "write_throughput_disabled_warning" { - description = "Disable warning alerting rule for write_throughput detector" +variable "write_throughput_disabled_major" { + description = "Disable major alerting rule for write_throughput detector" type = bool default = null } -variable "write_throughput_threshold_minor" { - description = "Minor threshold for write_throughput detector in %" +variable "write_throughput_threshold_critical" { + description = "Critical threshold for write_throughput detector in %" type = number } -variable "write_throughput_lasting_duration_minor" { +variable "write_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "15m" } -variable "write_throughput_at_least_percentage_minor" { +variable "write_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "write_throughput_threshold_warning" { - description = "Warning threshold for write_throughput detector in %" +variable "write_throughput_threshold_major" { + description = "Major threshold for write_throughput detector in %" type = number } -variable "write_throughput_lasting_duration_warning" { +variable 
"write_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "15m" } -variable "write_throughput_at_least_percentage_warning" { +variable "write_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -400,48 +400,48 @@ variable "percent_of_permitted_throughput_disabled" { default = null } -variable "percent_of_permitted_throughput_disabled_major" { - description = "Disable major alerting rule for percent_of_permitted_throughput detector" +variable "percent_of_permitted_throughput_disabled_critical" { + description = "Disable critical alerting rule for percent_of_permitted_throughput detector" type = bool default = null } -variable "percent_of_permitted_throughput_disabled_minor" { - description = "Disable minor alerting rule for percent_of_permitted_throughput detector" +variable "percent_of_permitted_throughput_disabled_major" { + description = "Disable major alerting rule for percent_of_permitted_throughput detector" type = bool default = null } -variable "percent_of_permitted_throughput_threshold_major" { - description = "Major threshold for percent_of_permitted_throughput detector in %" +variable "percent_of_permitted_throughput_threshold_critical" { + description = "Critical threshold for percent_of_permitted_throughput detector in %" type = number default = 90 } -variable "percent_of_permitted_throughput_lasting_duration_major" { +variable "percent_of_permitted_throughput_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "30m" } -variable "percent_of_permitted_throughput_at_least_percentage_major" { +variable "percent_of_permitted_throughput_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 
and <= 1.0)" type = number default = 1 } -variable "percent_of_permitted_throughput_threshold_minor" { - description = "Minor threshold for percent_of_permitted_throughput detector in %" +variable "percent_of_permitted_throughput_threshold_major" { + description = "Major threshold for percent_of_permitted_throughput detector in %" type = number default = 80 } -variable "percent_of_permitted_throughput_lasting_duration_minor" { +variable "percent_of_permitted_throughput_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "30m" } -variable "percent_of_permitted_throughput_at_least_percentage_minor" { +variable "percent_of_permitted_throughput_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -492,19 +492,19 @@ variable "burst_credit_balance_disabled" { default = null } -variable "burst_credit_balance_threshold_major" { - description = "Major threshold for burst_credit_balance detector in credits" +variable "burst_credit_balance_threshold_critical" { + description = "Critical threshold for burst_credit_balance detector in credits" type = number default = 1 } -variable "burst_credit_balance_lasting_duration_major" { +variable "burst_credit_balance_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "burst_credit_balance_at_least_percentage_major" { +variable "burst_credit_balance_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 From a3a5a84f3f3525d83170cee6cb9b321c342feb10 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 16:20:59 +0100 Subject: [PATCH 04/11] fix: integration_aws-efs: enable all alarms by default --- 
modules/integration_aws-efs/conf/03-throughput-read.yaml | 1 - modules/integration_aws-efs/conf/04-throughput-write.yaml | 1 - modules/integration_aws-efs/variables-gen.tf | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index 8bfebc830..a2f367866 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -4,7 +4,6 @@ id: "read_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" condition: "var.read_throughput_detector_enabled" -disabled: true signals: read: metric: "DataReadIOBytes" diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml b/modules/integration_aws-efs/conf/04-throughput-write.yaml index ebb74aaad..9fa121ed5 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -4,7 +4,6 @@ id: "write_throughput" filtering: "filter('namespace', 'AWS/EFS')" value_unit: "%" condition: "var.write_throughput_detector_enabled" -disabled: true signals: write: metric: "DataWriteIOBytes" diff --git a/modules/integration_aws-efs/variables-gen.tf b/modules/integration_aws-efs/variables-gen.tf index 3e569fbc1..f63743d70 100644 --- a/modules/integration_aws-efs/variables-gen.tf +++ b/modules/integration_aws-efs/variables-gen.tf @@ -219,7 +219,7 @@ variable "read_throughput_runbook_url" { variable "read_throughput_disabled" { description = "Disable all alerting rules for read_throughput detector" type = bool - default = true + default = null } variable "read_throughput_disabled_critical" { @@ -307,7 +307,7 @@ variable "write_throughput_runbook_url" { variable "write_throughput_disabled" { description = "Disable all alerting rules for write_throughput detector" type = bool - default = true + default = null } variable 
"write_throughput_disabled_critical" { From 8da8f49334af656e7efe3aca37e2ae4e8f76f484 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 16:30:28 +0100 Subject: [PATCH 05/11] feat: integration_aws-efs: set read/write throughput default thresholds --- modules/integration_aws-efs/README.md | 12 ++++-------- .../integration_aws-efs/conf/03-throughput-read.yaml | 2 ++ .../conf/04-throughput-write.yaml | 2 ++ modules/integration_aws-efs/variables-gen.tf | 4 ++++ 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/modules/integration_aws-efs/README.md b/modules/integration_aws-efs/README.md index 38e9d7f8c..5562fb540 100644 --- a/modules/integration_aws-efs/README.md +++ b/modules/integration_aws-efs/README.md @@ -28,14 +28,10 @@ existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/G module "signalfx-detectors-integration-aws-efs" { source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-efs?ref={revision}" - environment = var.environment - notifications = local.notifications - used_space_threshold_major = 42 - write_throughput_threshold_minor = 42 - read_throughput_threshold_minor = 42 - read_throughput_threshold_warning = 42 - write_throughput_threshold_warning = 42 - used_space_threshold_critical = 42 + environment = var.environment + notifications = local.notifications + used_space_threshold_major = 42 + used_space_threshold_critical = 42 } ``` diff --git a/modules/integration_aws-efs/conf/03-throughput-read.yaml b/modules/integration_aws-efs/conf/03-throughput-read.yaml index a2f367866..351d8a730 100644 --- a/modules/integration_aws-efs/conf/03-throughput-read.yaml +++ b/modules/integration_aws-efs/conf/03-throughput-read.yaml @@ -17,8 +17,10 @@ signals: rules: critical: comparator: ">" + threshold: 90 lasting_duration: "15m" major: lasting_duration: "15m" comparator: ">" + threshold: 80 dependency: critical diff --git a/modules/integration_aws-efs/conf/04-throughput-write.yaml 
b/modules/integration_aws-efs/conf/04-throughput-write.yaml index 9fa121ed5..63cae419a 100644 --- a/modules/integration_aws-efs/conf/04-throughput-write.yaml +++ b/modules/integration_aws-efs/conf/04-throughput-write.yaml @@ -17,8 +17,10 @@ signals: rules: critical: comparator: ">" + threshold: 90 lasting_duration: "15m" major: comparator: ">" + threshold: 80 dependency: critical lasting_duration: "15m" diff --git a/modules/integration_aws-efs/variables-gen.tf b/modules/integration_aws-efs/variables-gen.tf index f63743d70..4cf22ccfb 100644 --- a/modules/integration_aws-efs/variables-gen.tf +++ b/modules/integration_aws-efs/variables-gen.tf @@ -237,6 +237,7 @@ variable "read_throughput_disabled_major" { variable "read_throughput_threshold_critical" { description = "Critical threshold for read_throughput detector in %" type = number + default = 90 } variable "read_throughput_lasting_duration_critical" { @@ -253,6 +254,7 @@ variable "read_throughput_at_least_percentage_critical" { variable "read_throughput_threshold_major" { description = "Major threshold for read_throughput detector in %" type = number + default = 80 } variable "read_throughput_lasting_duration_major" { @@ -325,6 +327,7 @@ variable "write_throughput_disabled_major" { variable "write_throughput_threshold_critical" { description = "Critical threshold for write_throughput detector in %" type = number + default = 90 } variable "write_throughput_lasting_duration_critical" { @@ -341,6 +344,7 @@ variable "write_throughput_at_least_percentage_critical" { variable "write_throughput_threshold_major" { description = "Major threshold for write_throughput detector in %" type = number + default = 80 } variable "write_throughput_lasting_duration_major" { From ef6090031ced0d7a96ac8e0af5bddd9ec48c76cf Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 16:49:04 +0100 Subject: [PATCH 06/11] feat: integration_aws-rds-common: optionalize detectors --- modules/integration_aws-rds-common/README.md | 2 
+- .../conf/00-heartbeat.yaml | 1 + .../conf/01-cpu.yaml | 1 + .../conf/02-free-space.yaml | 1 + .../conf/03-replica-lag.yaml | 1 + .../conf/04-dbload.yaml | 1 + .../detectors-gen.tf | 10 +++++++ .../integration_aws-rds-common/variables.tf | 29 +++++++++++++++++++ 8 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 modules/integration_aws-rds-common/variables.tf diff --git a/modules/integration_aws-rds-common/README.md b/modules/integration_aws-rds-common/README.md index 548798557..1becc32b4 100644 --- a/modules/integration_aws-rds-common/README.md +++ b/modules/integration_aws-rds-common/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables-gen.tf](variables-gen.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. 
diff --git a/modules/integration_aws-rds-common/conf/00-heartbeat.yaml b/modules/integration_aws-rds-common/conf/00-heartbeat.yaml index e886f3d63..b8f8dcd2c 100644 --- a/modules/integration_aws-rds-common/conf/00-heartbeat.yaml +++ b/modules/integration_aws-rds-common/conf/00-heartbeat.yaml @@ -4,6 +4,7 @@ name: heartbeat transformation: false aggregation: ".mean(by=['DBInstanceIdentifier'])" filtering: "filter('namespace', 'AWS/RDS')" +condition: "var.heartbeat_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/01-cpu.yaml b/modules/integration_aws-rds-common/conf/01-cpu.yaml index 39a61ec3a..827a20526 100644 --- a/modules/integration_aws-rds-common/conf/01-cpu.yaml +++ b/modules/integration_aws-rds-common/conf/01-cpu.yaml @@ -7,6 +7,7 @@ aggregation: ".min(over='15m')" filtering: "filter('namespace', 'AWS/RDS')" value_unit: "%" +condition: "var.cpu_usage_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/02-free-space.yaml b/modules/integration_aws-rds-common/conf/02-free-space.yaml index c2ba75164..828673d10 100644 --- a/modules/integration_aws-rds-common/conf/02-free-space.yaml +++ b/modules/integration_aws-rds-common/conf/02-free-space.yaml @@ -7,6 +7,7 @@ aggregation: ".min(over='15m')" filtering: "filter('namespace', 'AWS/RDS')" value_unit: "Gibibyte" +condition: "var.free_space_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml index ad1c163a8..e295a1f26 100644 --- a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml +++ b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml @@ -5,6 +5,7 @@ transformation: true aggregation: ".min(over='5m')" filtering: "filter('namespace', 'AWS/RDS')" +condition: "var.replica_lag_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/conf/04-dbload.yaml 
b/modules/integration_aws-rds-common/conf/04-dbload.yaml index 5fb89adf9..87c0637fb 100644 --- a/modules/integration_aws-rds-common/conf/04-dbload.yaml +++ b/modules/integration_aws-rds-common/conf/04-dbload.yaml @@ -5,6 +5,7 @@ name: "DB Load" transformation: true aggregation: true filtering: "filter('namespace', 'AWS/RDS') and filter('stat', 'mean')" +condition: "var.dbload_detector_enabled" signals: signal: diff --git a/modules/integration_aws-rds-common/detectors-gen.tf b/modules/integration_aws-rds-common/detectors-gen.tf index cccaab132..1e462c569 100644 --- a/modules/integration_aws-rds-common/detectors-gen.tf +++ b/modules/integration_aws-rds-common/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "cpu_usage" { + count = (var.cpu_usage_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common instance cpu") authorized_writer_teams = var.authorized_writer_teams @@ -74,6 +78,8 @@ EOF } resource "signalfx_detector" "free_space_low" { + count = (var.free_space_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common instance free space") authorized_writer_teams = var.authorized_writer_teams @@ -120,6 +126,8 @@ EOF } resource "signalfx_detector" "replica_lag" { + count = (var.replica_lag_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common replica lag") authorized_writer_teams = var.authorized_writer_teams @@ -161,6 +169,8 @@ EOF } resource "signalfx_detector" "dbload" { + count = (var.dbload_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS RDS Common db load") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-rds-common/variables.tf b/modules/integration_aws-rds-common/variables.tf new file mode 100644 index 000000000..f64f54bda --- /dev/null +++ b/modules/integration_aws-rds-common/variables.tf @@ -0,0 +1,29 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "cpu_usage_detector_enabled" { + description = "Enable cpu usage detector" + type = bool + default = true +} + +variable "free_space_detector_enabled" { + description = "Enable free space detector" + type = bool + default = true +} + +variable "replica_lag_detector_enabled" { + description = "Enable replica lag detector" + type = bool + default = true +} + +variable "dbload_detector_enabled" { + description = "Enable dbload detector" + type = bool + default = true +} From b0f668b0aba096fc906b668188ccb4945aba54f3 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 17:00:17 +0100 Subject: [PATCH 07/11] fix: integration_aws-rds-common: replace transformation with lasting property --- modules/integration_aws-rds-common/conf/01-cpu.yaml | 3 --- modules/integration_aws-rds-common/conf/02-free-space.yaml | 2 -- modules/integration_aws-rds-common/conf/03-replica-lag.yaml | 5 ++--- modules/integration_aws-rds-common/conf/04-dbload.yaml | 2 -- modules/integration_aws-rds-common/variables-gen.tf | 6 +++--- 5 files changed, 5 insertions(+), 13 deletions(-) diff --git a/modules/integration_aws-rds-common/conf/01-cpu.yaml b/modules/integration_aws-rds-common/conf/01-cpu.yaml index 827a20526..b66f1a718 100644 --- a/modules/integration_aws-rds-common/conf/01-cpu.yaml +++ b/modules/integration_aws-rds-common/conf/01-cpu.yaml @@ -2,9 +2,6 @@ module: AWS RDS Common id: "cpu_usage" name: "Instance CPU" -transformation: true -aggregation: ".min(over='15m')" - 
filtering: "filter('namespace', 'AWS/RDS')" value_unit: "%" condition: "var.cpu_usage_detector_enabled" diff --git a/modules/integration_aws-rds-common/conf/02-free-space.yaml b/modules/integration_aws-rds-common/conf/02-free-space.yaml index 828673d10..22fb08c9e 100644 --- a/modules/integration_aws-rds-common/conf/02-free-space.yaml +++ b/modules/integration_aws-rds-common/conf/02-free-space.yaml @@ -3,8 +3,6 @@ id: "free_space_low" name: "Instance free space" transformation: ".scale(1/1024**3)" -aggregation: ".min(over='15m')" - filtering: "filter('namespace', 'AWS/RDS')" value_unit: "Gibibyte" condition: "var.free_space_detector_enabled" diff --git a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml index e295a1f26..3e32181ae 100644 --- a/modules/integration_aws-rds-common/conf/03-replica-lag.yaml +++ b/modules/integration_aws-rds-common/conf/03-replica-lag.yaml @@ -1,9 +1,6 @@ module: AWS RDS Common name: "Replica lag" -transformation: true -aggregation: ".min(over='5m')" - filtering: "filter('namespace', 'AWS/RDS')" condition: "var.replica_lag_detector_enabled" @@ -16,7 +13,9 @@ rules: critical: threshold: 300 comparator: ">" + lasting_duration: "5m" major: threshold: 200 comparator: ">" + lasting_duration: "5m" dependency: critical diff --git a/modules/integration_aws-rds-common/conf/04-dbload.yaml b/modules/integration_aws-rds-common/conf/04-dbload.yaml index 87c0637fb..e9446d192 100644 --- a/modules/integration_aws-rds-common/conf/04-dbload.yaml +++ b/modules/integration_aws-rds-common/conf/04-dbload.yaml @@ -2,8 +2,6 @@ module: AWS RDS Common id: dbload name: "DB Load" -transformation: true -aggregation: true filtering: "filter('namespace', 'AWS/RDS') and filter('stat', 'mean')" condition: "var.dbload_detector_enabled" diff --git a/modules/integration_aws-rds-common/variables-gen.tf b/modules/integration_aws-rds-common/variables-gen.tf index 585500e5f..3ebcaf28b 100644 ---
a/modules/integration_aws-rds-common/variables-gen.tf +++ b/modules/integration_aws-rds-common/variables-gen.tf @@ -53,7 +53,7 @@ variable "cpu_usage_notifications" { variable "cpu_usage_aggregation_function" { description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "cpu_usage_transformation_function" { @@ -143,7 +143,7 @@ variable "free_space_low_notifications" { variable "free_space_low_aggregation_function" { description = "Aggregation function and group by for free_space_low detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "free_space_low_transformation_function" { @@ -233,7 +233,7 @@ variable "replica_lag_notifications" { variable "replica_lag_aggregation_function" { description = "Aggregation function and group by for replica_lag detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='5m')" + default = "" } variable "replica_lag_transformation_function" { From fdf9fdfa7cdeb785fc479117ea2a906f708527aa Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 17:18:11 +0100 Subject: [PATCH 08/11] feat: integration_aws-alb: optionalize detectors --- .../conf/00-heartbeat.yaml | 1 + .../integration_aws-alb/conf/01-latency.yaml | 1 + .../integration_aws-alb/conf/02-lb-5xx.yaml | 1 + .../integration_aws-alb/conf/03-lb-4xx.yaml | 1 + .../conf/04-target-5xx.yaml | 1 + .../conf/05-target-4xx.yaml | 1 + .../integration_aws-alb/conf/06-healthy.yaml | 1 + modules/integration_aws-alb/detectors-gen.tf | 14 +++++++ modules/integration_aws-alb/variables.tf | 42 +++++++++++++++++++ 9 files changed, 63 insertions(+) diff --git a/modules/integration_aws-alb/conf/00-heartbeat.yaml b/modules/integration_aws-alb/conf/00-heartbeat.yaml index e4bbd449d..bc8004140 100644 --- a/modules/integration_aws-alb/conf/00-heartbeat.yaml +++ 
b/modules/integration_aws-alb/conf/00-heartbeat.yaml @@ -4,6 +4,7 @@ name: heartbeat transformation: false aggregation: ".mean(by=['LoadBalancer'])" filtering: "filter('namespace', 'AWS/ApplicationELB')" +condition: "var.heartbeat_detector_enabled" signals: signal: diff --git a/modules/integration_aws-alb/conf/01-latency.yaml b/modules/integration_aws-alb/conf/01-latency.yaml index 5f4ea61c8..557d462ca 100644 --- a/modules/integration_aws-alb/conf/01-latency.yaml +++ b/modules/integration_aws-alb/conf/01-latency.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB')" value_unit: "Second" +condition: "var.latency_detector_enabled" signals: signal: diff --git a/modules/integration_aws-alb/conf/02-lb-5xx.yaml b/modules/integration_aws-alb/conf/02-lb-5xx.yaml index 8ef21a35e..3ecbd045e 100644 --- a/modules/integration_aws-alb/conf/02-lb-5xx.yaml +++ b/modules/integration_aws-alb/conf/02-lb-5xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.lb_5xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/03-lb-4xx.yaml b/modules/integration_aws-alb/conf/03-lb-4xx.yaml index d31fe8d70..b225ed13d 100644 --- a/modules/integration_aws-alb/conf/03-lb-4xx.yaml +++ b/modules/integration_aws-alb/conf/03-lb-4xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.lb_4xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/04-target-5xx.yaml b/modules/integration_aws-alb/conf/04-target-5xx.yaml index 8d8f3c1d9..584423b13 100644 --- a/modules/integration_aws-alb/conf/04-target-5xx.yaml +++ b/modules/integration_aws-alb/conf/04-target-5xx.yaml @@ 
-6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.target_5xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/05-target-4xx.yaml b/modules/integration_aws-alb/conf/05-target-4xx.yaml index a7d96384a..5d3888dff 100644 --- a/modules/integration_aws-alb/conf/05-target-4xx.yaml +++ b/modules/integration_aws-alb/conf/05-target-4xx.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.target_4xx_detector_enabled" signals: errors: diff --git a/modules/integration_aws-alb/conf/06-healthy.yaml b/modules/integration_aws-alb/conf/06-healthy.yaml index 4f75d0ab3..47eddfc0d 100644 --- a/modules/integration_aws-alb/conf/06-healthy.yaml +++ b/modules/integration_aws-alb/conf/06-healthy.yaml @@ -6,6 +6,7 @@ transformation: true aggregation: true filtering: "filter('namespace', 'AWS/ApplicationELB') and (not filter('AvailabilityZone', '*'))" value_unit: "%" +condition: "var.healthy_detector_enabled" signals: healthy: diff --git a/modules/integration_aws-alb/detectors-gen.tf b/modules/integration_aws-alb/detectors-gen.tf index 578805590..ed6ded754 100644 --- a/modules/integration_aws-alb/detectors-gen.tf +++ b/modules/integration_aws-alb/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "latency" { + count = (var.latency_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target response time") authorized_writer_teams = var.authorized_writer_teams @@ -74,6 +78,8 @@ EOF } resource "signalfx_detector" "alb_5xx" { + count = (var.lb_5xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB 5xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -122,6 +128,8 @@ EOF } resource "signalfx_detector" "alb_4xx" { + count = (var.lb_4xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB 4xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -183,6 +191,8 @@ EOF } resource "signalfx_detector" "target_5xx" { + count = (var.target_5xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target 5xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -231,6 +241,8 @@ EOF } resource "signalfx_detector" "target_4xx" { + count = (var.target_4xx_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB target 4xx error rate") authorized_writer_teams = var.authorized_writer_teams @@ -292,6 +304,8 @@ EOF } resource "signalfx_detector" "healthy" { + count = (var.healthy_detector_enabled) ? 
1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS ALB healthy instances percentage") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-alb/variables.tf b/modules/integration_aws-alb/variables.tf index 696da8ee1..3f74ec7e7 100644 --- a/modules/integration_aws-alb/variables.tf +++ b/modules/integration_aws-alb/variables.tf @@ -1,3 +1,45 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "latency_detector_enabled" { + description = "Enable latency detector" + type = bool + default = true +} + +variable "lb_5xx_detector_enabled" { + description = "Enable lb 5xx detector" + type = bool + default = true +} + +variable "lb_4xx_detector_enabled" { + description = "Enable lb 4xx detector" + type = bool + default = true +} + +variable "target_5xx_detector_enabled" { + description = "Enable target 5xx detector" + type = bool + default = true +} + +variable "target_4xx_detector_enabled" { + description = "Enable target 4xx detector" + type = bool + default = true +} + +variable "healthy_detector_enabled" { + description = "Enable healthy detector" + type = bool + default = true +} + # Module specific variable "minimum_traffic" { From 8ed5c87359326453784e50a93e2a6ebce6f70e56 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Fri, 10 Jan 2025 17:55:05 +0100 Subject: [PATCH 09/11] fix: integration_aws-alb: enable all alarms by default --- modules/integration_aws-alb/conf/03-lb-4xx.yaml | 1 - modules/integration_aws-alb/conf/05-target-4xx.yaml | 1 - modules/integration_aws-alb/variables-gen.tf | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/integration_aws-alb/conf/03-lb-4xx.yaml b/modules/integration_aws-alb/conf/03-lb-4xx.yaml index b225ed13d..ddcbeab2b 100644 --- a/modules/integration_aws-alb/conf/03-lb-4xx.yaml +++ b/modules/integration_aws-alb/conf/03-lb-4xx.yaml @@ -26,7 +26,6 @@ rules: lasting_duration: 
15m lasting_at_least: 0.9 append_condition: and when(requests > ${var.minimum_traffic}) - disabled: true major: threshold: 95 comparator: ">" diff --git a/modules/integration_aws-alb/conf/05-target-4xx.yaml b/modules/integration_aws-alb/conf/05-target-4xx.yaml index 5d3888dff..1e6fff490 100644 --- a/modules/integration_aws-alb/conf/05-target-4xx.yaml +++ b/modules/integration_aws-alb/conf/05-target-4xx.yaml @@ -26,7 +26,6 @@ rules: lasting_duration: 15m lasting_at_least: 0.9 append_condition: and when(requests > ${var.minimum_traffic}) - disabled: true major: threshold: 95 comparator: ">" diff --git a/modules/integration_aws-alb/variables-gen.tf b/modules/integration_aws-alb/variables-gen.tf index 83af98b31..cf7730c4b 100644 --- a/modules/integration_aws-alb/variables-gen.tf +++ b/modules/integration_aws-alb/variables-gen.tf @@ -269,7 +269,7 @@ variable "alb_4xx_disabled" { variable "alb_4xx_disabled_critical" { description = "Disable critical alerting rule for alb_4xx detector" type = bool - default = true + default = null } variable "alb_4xx_disabled_major" { @@ -472,7 +472,7 @@ variable "target_4xx_disabled" { variable "target_4xx_disabled_critical" { description = "Disable critical alerting rule for target_4xx detector" type = bool - default = true + default = null } variable "target_4xx_disabled_major" { From dbff769820a914a1f77c08b816df0b047a4d8044 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Thu, 30 Jan 2025 11:59:07 +0100 Subject: [PATCH 10/11] feat: integration_aws-nlb: optionalize detectors --- modules/integration_aws-nlb/README.md | 2 +- modules/integration_aws-nlb/conf/00-heartbeat.yaml | 3 ++- .../conf/01-no_healthy_instances.yaml | 1 + modules/integration_aws-nlb/detectors-gen.tf | 4 ++++ modules/integration_aws-nlb/variables.tf | 11 +++++++++++ 5 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 modules/integration_aws-nlb/variables.tf diff --git a/modules/integration_aws-nlb/README.md b/modules/integration_aws-nlb/README.md 
index 1f609e5b3..783174628 100644 --- a/modules/integration_aws-nlb/README.md +++ b/modules/integration_aws-nlb/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables-gen.tf](variables-gen.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. diff --git a/modules/integration_aws-nlb/conf/00-heartbeat.yaml b/modules/integration_aws-nlb/conf/00-heartbeat.yaml index 6410e216e..191a0eec9 100644 --- a/modules/integration_aws-nlb/conf/00-heartbeat.yaml +++ b/modules/integration_aws-nlb/conf/00-heartbeat.yaml @@ -4,9 +4,10 @@ name: "heartbeat" transformation: true aggregation: ".mean(by=['LoadBalancer'])" filtering: "filter('stat', 'mean') and filter('namespace', 'AWS/NetworkELB')" +condition: "var.heartbeat_detector_enabled" signals: signal: metric: "ConsumedLCUs" rules: - critical: \ No newline at end of file + critical: diff --git a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml index 6d4391355..f2da20d12 100644 --- a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml +++ b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml @@ -5,6 +5,7 @@ id: "no_healthy_instances" transformation: ".min(over='5m')" aggregation: true filtering: "filter('namespace', 'AWS/NetworkELB') and (not filter('AvailabilityZone', '*'))" +condition: "var.healthy_instances_detector_enabled" signals: A: diff --git a/modules/integration_aws-nlb/detectors-gen.tf b/modules/integration_aws-nlb/detectors-gen.tf index d53fc1cf6..1936b26ec 100644 --- 
a/modules/integration_aws-nlb/detectors-gen.tf +++ b/modules/integration_aws-nlb/detectors-gen.tf @@ -1,4 +1,6 @@ resource "signalfx_detector" "heartbeat" { + count = (var.heartbeat_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS NLB heartbeat") authorized_writer_teams = var.authorized_writer_teams @@ -28,6 +30,8 @@ EOF } resource "signalfx_detector" "no_healthy_instances" { + count = (var.healthy_instances_detector_enabled) ? 1 : 0 + name = format("%s %s", local.detector_name_prefix, "AWS NLB healthy instances percentage") authorized_writer_teams = var.authorized_writer_teams diff --git a/modules/integration_aws-nlb/variables.tf b/modules/integration_aws-nlb/variables.tf new file mode 100644 index 000000000..7161f5ba1 --- /dev/null +++ b/modules/integration_aws-nlb/variables.tf @@ -0,0 +1,11 @@ +variable "heartbeat_detector_enabled" { + description = "Enable heartbeat detector" + type = bool + default = true +} + +variable "healthy_instances_detector_enabled" { + description = "Enable healthy instances detector" + type = bool + default = true +} From f7fe62b3ba32572ce2762645c898fdebc1c748d8 Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Thu, 30 Jan 2025 12:00:30 +0100 Subject: [PATCH 11/11] fix: integration_aws-nlb: replace transformation with lasting property --- .../integration_aws-nlb/conf/01-no_healthy_instances.yaml | 7 ++++--- modules/integration_aws-nlb/variables-gen.tf | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml index f2da20d12..085ae2599 100644 --- a/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml +++ b/modules/integration_aws-nlb/conf/01-no_healthy_instances.yaml @@ -2,7 +2,6 @@ module: "AWS NLB" name: "Healthy instances percentage" id: "no_healthy_instances" -transformation: ".min(over='5m')" aggregation: true filtering: "filter('namespace', 
'AWS/NetworkELB') and (not filter('AvailabilityZone', '*'))" condition: "var.healthy_instances_detector_enabled" @@ -21,8 +20,10 @@ rules: critical: threshold: 1 comparator: "<" - + lasting_duration: "5m" + major: threshold: 100 comparator: "<" - dependency: "critical" \ No newline at end of file + dependency: "critical" + lasting_duration: "5m" diff --git a/modules/integration_aws-nlb/variables-gen.tf b/modules/integration_aws-nlb/variables-gen.tf index da6880eec..1555c87dc 100644 --- a/modules/integration_aws-nlb/variables-gen.tf +++ b/modules/integration_aws-nlb/variables-gen.tf @@ -65,7 +65,7 @@ variable "no_healthy_instances_aggregation_function" { variable "no_healthy_instances_transformation_function" { description = "Transformation function for no_healthy_instances detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='5m')" + default = "" } variable "no_healthy_instances_max_delay" { @@ -113,7 +113,7 @@ variable "no_healthy_instances_threshold_critical" { variable "no_healthy_instances_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "no_healthy_instances_at_least_percentage_critical" { @@ -130,7 +130,7 @@ variable "no_healthy_instances_threshold_major" { variable "no_healthy_instances_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "no_healthy_instances_at_least_percentage_major" {