Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -1212,7 +1212,6 @@
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Nginx heartbeat|X|-|-|-|-|
|Nginx dropped connections|X|X|-|-|-|


## smart-agent_ntp
Expand Down
4 changes: 1 addition & 3 deletions modules/smart-agent_nginx/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ This module creates the following SignalFx detectors which could contain one or
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Nginx heartbeat|X|-|-|-|-|
|Nginx dropped connections|X|X|-|-|-|

## How to collect required metrics?

Expand Down Expand Up @@ -117,8 +116,7 @@ parameter to the corresponding monitor configuration:
datapointsToExclude:
- metricNames:
- '*'
- '!connections.failed'
- '!nginx_connections.reading'
- '!nginx.connections_current'

```

Expand Down
43 changes: 1 addition & 42 deletions modules/smart-agent_nginx/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ resource "signalfx_detector" "heartbeat" {

program_text = <<-EOF
from signalfx.detectors.not_reporting import not_reporting
signal = data('nginx_connections.reading', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
signal = data('nginx.connections_current', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT')
EOF

Expand All @@ -25,44 +25,3 @@ EOF

max_delay = var.heartbeat_max_delay
}

# Detector built on the `connections.failed` metric. It publishes two alert
# labels, CRIT and MAJOR, each bound to one rule below.
resource "signalfx_detector" "dropped_connections" {
  name      = format("%s %s", local.detector_name_prefix, "Nginx dropped connections")
  max_delay = var.dropped_connections_max_delay

  # Ownership and tagging.
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams

  # SignalFlow program:
  #   - `signal` is the filtered metric followed by the configurable
  #     aggregation and transformation suffixes.
  #   - CRIT fires when signal > critical threshold.
  #   - MAJOR fires when signal > major threshold and the critical condition
  #     is not met, so both severities never fire for the same condition.
  #   - `lasting` / `at_least` are only rendered when the matching
  #     lasting_duration variable is not null.
  # The `<<-` form strips the common leading indentation of the lines below.
  program_text = <<-EOF
    signal = data('connections.failed', filter=${module.filtering.signalflow})${var.dropped_connections_aggregation_function}${var.dropped_connections_transformation_function}.publish('signal')
    detect(when(signal > ${var.dropped_connections_threshold_critical}%{if var.dropped_connections_lasting_duration_critical != null}, lasting='${var.dropped_connections_lasting_duration_critical}', at_least=${var.dropped_connections_at_least_percentage_critical}%{endif})).publish('CRIT')
    detect(when(signal > ${var.dropped_connections_threshold_major}%{if var.dropped_connections_lasting_duration_major != null}, lasting='${var.dropped_connections_lasting_duration_major}', at_least=${var.dropped_connections_at_least_percentage_major}%{endif}) and (not when(signal > ${var.dropped_connections_threshold_critical}%{if var.dropped_connections_lasting_duration_critical != null}, lasting='${var.dropped_connections_lasting_duration_critical}', at_least=${var.dropped_connections_at_least_percentage_critical}%{endif}))).publish('MAJOR')
    EOF

  # Critical rule, bound to the 'CRIT' label published above.
  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "is too high > ${var.dropped_connections_threshold_critical}"

    # Most specific non-null switch wins: per-severity, per-detector, module-wide.
    disabled = coalesce(var.dropped_connections_disabled_critical, var.dropped_connections_disabled, var.detectors_disabled)

    # Detector-level recipients take precedence over the module-wide list.
    notifications = try(coalescelist(lookup(var.dropped_connections_notifications, "critical", []), var.notifications.critical), null)

    tip         = var.dropped_connections_tip
    runbook_url = try(coalesce(var.dropped_connections_runbook_url, var.runbook_url), "")

    # Empty message overrides fall back to the module's default templates.
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  # Major rule, bound to the 'MAJOR' label published above.
  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "is too high > ${var.dropped_connections_threshold_major}"

    # Most specific non-null switch wins: per-severity, per-detector, module-wide.
    disabled = coalesce(var.dropped_connections_disabled_major, var.dropped_connections_disabled, var.detectors_disabled)

    # Detector-level recipients take precedence over the module-wide list.
    notifications = try(coalescelist(lookup(var.dropped_connections_notifications, "major", []), var.notifications.major), null)

    tip         = var.dropped_connections_tip
    runbook_url = try(coalesce(var.dropped_connections_runbook_url, var.runbook_url), "")

    # Empty message overrides fall back to the module's default templates.
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }
}

5 changes: 0 additions & 5 deletions modules/smart-agent_nginx/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
# Exposes the whole dropped_connections detector resource to callers of the module.
output "dropped_connections" {
  value       = signalfx_detector.dropped_connections
  description = "Detector resource for dropped_connections"
}

output "heartbeat" {
description = "Detector resource for heartbeat"
value = signalfx_detector.heartbeat
Expand Down
91 changes: 0 additions & 91 deletions modules/smart-agent_nginx/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,94 +53,3 @@ variable "heartbeat_timeframe" {
type = string
default = "25m"
}

# dropped_connections detector

# Per-severity recipients override. The detector rules look up the "critical"
# and "major" keys and fall back to var.notifications when a key is absent or empty.
variable "dropped_connections_notifications" {
  type        = map(list(string))
  default     = {}
  description = "Notification recipients list per severity overridden for dropped_connections detector"
}

# Raw SignalFlow suffix appended right after data(...) in the detector program;
# the empty default appends nothing.
variable "dropped_connections_aggregation_function" {
  type        = string
  default     = ""
  description = "Aggregation function and group by for dropped_connections detector (i.e. \".mean(by=['host'])\")"
}

# Raw SignalFlow suffix appended after the aggregation suffix in the detector program.
variable "dropped_connections_transformation_function" {
  type        = string
  default     = ".min(over='5m')"
  description = "Transformation function for dropped_connections detector (i.e. \".mean(over='5m')\")"
}

# Passed unchanged to the detector's max_delay argument.
variable "dropped_connections_max_delay" {
  type        = number
  default     = null
  description = "Enforce max delay for dropped_connections detector (use \"0\" or \"null\" for \"Auto\")"
}

# Used as the `tip` of both the Critical and the Major rule.
variable "dropped_connections_tip" {
  type        = string
  default     = ""
  description = "Suggested first course of action or any note useful for incident handling"
}

# Detector-specific runbook link; the rules coalesce it with var.runbook_url,
# so the empty default defers to the module-wide value.
variable "dropped_connections_runbook_url" {
  type        = string
  default     = ""
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
}

# Detector-wide switch. In the rules it is consulted after the per-severity
# switch and before var.detectors_disabled; null means "no opinion".
variable "dropped_connections_disabled" {
  type        = bool
  default     = null
  description = "Disable all alerting rules for dropped_connections detector"
}

# First value consulted for the Critical rule's `disabled` flag; null defers
# to the detector-wide and then the module-wide switch.
variable "dropped_connections_disabled_critical" {
  type        = bool
  default     = null
  description = "Disable critical alerting rule for dropped_connections detector"
}

# First value consulted for the Major rule's `disabled` flag; null defers
# to the detector-wide and then the module-wide switch.
variable "dropped_connections_disabled_major" {
  type        = bool
  default     = null
  description = "Disable major alerting rule for dropped_connections detector"
}

# The CRIT condition is `signal > <this value>`; the same value also bounds
# the MAJOR condition from above.
variable "dropped_connections_threshold_critical" {
  type        = number
  default     = 1
  description = "Critical threshold for dropped_connections detector"
}

# When not null, the critical condition is rendered with `lasting='<value>'`
# and an `at_least` argument; when null, neither is emitted.
variable "dropped_connections_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Only rendered (as `at_least=`) when the critical lasting duration is set.
variable "dropped_connections_at_least_percentage_critical" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}
# The MAJOR condition is `signal > <this value>` while the critical condition
# is not met.
variable "dropped_connections_threshold_major" {
  type        = number
  default     = 0
  description = "Major threshold for dropped_connections detector"
}

# When not null, the major condition is rendered with `lasting='<value>'`
# and an `at_least` argument; when null, neither is emitted.
variable "dropped_connections_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Only rendered (as `at_least=`) when the major lasting duration is set.
variable "dropped_connections_at_least_percentage_major" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}
Loading