|
1 | 1 | { |
2 | | - prometheusAlerts+:: { |
| 2 | + new(this): { |
3 | 3 | groups+: [ |
4 | 4 | { |
5 | | - name: $._config.uid + '-alerts', |
| 5 | + name: this.config.uid + '-alerts', |
6 | 6 | rules: [ |
7 | 7 | { |
8 | 8 | alert: 'OpenSearchYellowCluster', |
9 | 9 | expr: ||| |
10 | 10 | opensearch_cluster_status{%(filteringSelector)s} == 1 |
11 | | - ||| % $._config, |
| 11 | + ||| % this.config, |
12 | 12 | 'for': '5m', |
13 | 13 | labels: { |
14 | 14 | severity: 'warning', |
|
18 | 18 | description: |
19 | 19 | ( |
20 | 20 | '{{$labels.cluster}} health status is yellow over the last 5 minutes' |
21 | | - ) % $._config, |
| 21 | + ) % this.config, |
22 | 22 | }, |
23 | 23 | }, |
24 | 24 | { |
25 | 25 | alert: 'OpenSearchRedCluster', |
26 | 26 | expr: ||| |
27 | 27 | opensearch_cluster_status{%(filteringSelector)s} == 2 |
28 | | - ||| % $._config, |
| 28 | + ||| % this.config, |
29 | 29 | 'for': '5m', |
30 | 30 | labels: { |
31 | 31 | severity: 'critical', |
|
35 | 35 | description: |
36 | 36 | ( |
37 | 37 | '{{$labels.cluster}} health status is red over the last 5 minutes' |
38 | | - ) % $._config, |
| 38 | + ) % this.config, |
39 | 39 | }, |
40 | 40 | }, |
41 | 41 | { |
42 | 42 | alert: 'OpenSearchUnstableShardReallocation', |
43 | 43 | expr: ||| |
44 | | - sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="relocating"}) > %(alertsWarningShardReallocations)s |
45 | | - ||| % $._config, |
| 44 | + sum without(type) (opensearch_cluster_shards_number{type="relocating", %(filteringSelector)s}) > %(alertsWarningShardReallocations)s |
| 45 | + ||| % this.config, |
46 | 46 | 'for': '1m', |
47 | 47 | labels: { |
48 | 48 | severity: 'warning', |
|
51 | 51 | summary: 'A node has gone offline or has been disconnected triggering shard reallocation.', |
52 | 52 | description: ||| |
53 | 53 | {{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard reallocation over the last 1m which is above the threshold of %(alertsWarningShardReallocations)s. |
54 | | - ||| % $._config, |
| 54 | + ||| % this.config, |
55 | 55 | }, |
56 | 56 | }, |
57 | 57 | { |
58 | 58 | alert: 'OpenSearchUnstableShardUnassigned', |
59 | 59 | expr: ||| |
60 | | - sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="unassigned"}) > %(alertsWarningShardUnassigned)s |
61 | | - ||| % $._config, |
| 60 | + sum without(type) (opensearch_cluster_shards_number{type="unassigned", %(filteringSelector)s}) > %(alertsWarningShardUnassigned)s |
| 61 | + ||| % this.config, |
62 | 62 | 'for': '5m', |
63 | 63 | labels: { |
64 | 64 | severity: 'warning', |
|
67 | 67 | summary: 'There are shards that have been detected as unassigned.', |
68 | 68 | description: ||| |
69 | 69 | {{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard unassigned over the last 5m which is above the threshold of %(alertsWarningShardUnassigned)s. |
70 | | - ||| % $._config, |
| 70 | + ||| % this.config, |
71 | 71 | }, |
72 | 72 | }, |
73 | 73 | { |
74 | 74 | alert: 'OpenSearchHighNodeDiskUsage', |
75 | 75 | expr: ||| |
76 | 76 | 100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsWarningDiskUsage)s |
77 | | - ||| % $._config, |
| 77 | + ||| % this.config, |
78 | 78 | 'for': '5m', |
79 | 79 | labels: { |
80 | 80 | severity: 'warning', |
|
83 | 83 | summary: 'The node disk usage has exceeded the warning threshold.', |
84 | 84 | description: ||| |
85 | 85 | {{$labels.node}} has had {{ printf "%%.0f" $value }} disk usage over the last 5m which is above the threshold of %(alertsWarningDiskUsage)s. |
86 | | - ||| % $._config, |
| 86 | + ||| % this.config, |
87 | 87 | }, |
88 | 88 | }, |
89 | 89 | { |
90 | 90 | alert: 'OpenSearchHighNodeDiskUsage', |
91 | 91 | expr: ||| |
92 | 92 | 100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsCriticalDiskUsage)s |
93 | | - ||| % $._config, |
| 93 | + ||| % this.config, |
94 | 94 | 'for': '5m', |
95 | 95 | labels: { |
96 | 96 | severity: 'critical', |
|
99 | 99 | summary: 'The node disk usage has exceeded the critical threshold.', |
100 | 100 | description: ||| |
101 | 101 | {{$labels.node}} has had {{ printf "%%.0f" $value }}%% disk usage over the last 5m which is above the threshold of %(alertsCriticalDiskUsage)s. |
102 | | - ||| % $._config, |
| 102 | + ||| % this.config, |
103 | 103 | }, |
104 | 104 | }, |
105 | 105 | { |
106 | 106 | alert: 'OpenSearchHighNodeCpuUsage', |
107 | 107 | expr: ||| |
108 | 108 | sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsWarningCPUUsage)s |
109 | | - ||| % $._config, |
| 109 | + ||| % this.config, |
110 | 110 | 'for': '5m', |
111 | 111 | labels: { |
112 | 112 | severity: 'warning', |
|
115 | 115 | summary: 'The node CPU usage has exceeded the warning threshold.', |
116 | 116 | description: ||| |
117 | 117 | {{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsWarningCPUUsage)s. |
118 | | - ||| % $._config, |
| 118 | + ||| % this.config, |
119 | 119 | }, |
120 | 120 | }, |
121 | 121 | { |
122 | 122 | alert: 'OpenSearchHighNodeCpuUsage', |
123 | 123 | expr: ||| |
124 | 124 | sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsCriticalCPUUsage)s |
125 | | - ||| % $._config, |
| 125 | + ||| % this.config, |
126 | 126 | 'for': '5m', |
127 | 127 | labels: { |
128 | 128 | severity: 'critical', |
|
131 | 131 | summary: 'The node CPU usage has exceeded the critical threshold.', |
132 | 132 | description: ||| |
133 | 133 | {{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsCriticalCPUUsage)s. |
134 | | - ||| % $._config, |
| 134 | + ||| % this.config, |
135 | 135 | }, |
136 | 136 | }, |
137 | 137 | { |
138 | 138 | alert: 'OpenSearchHighNodeMemoryUsage', |
139 | 139 | expr: ||| |
140 | 140 | sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsWarningMemoryUsage)s |
141 | | - ||| % $._config, |
| 141 | + ||| % this.config, |
142 | 142 | 'for': '5m', |
143 | 143 | labels: { |
144 | 144 | severity: 'warning', |
|
147 | 147 | summary: 'The node memory usage has exceeded the warning threshold.', |
148 | 148 | description: ||| |
149 | 149 | {{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsWarningMemoryUsage)s. |
150 | | - ||| % $._config, |
| 150 | + ||| % this.config, |
151 | 151 | }, |
152 | 152 | }, |
153 | 153 | { |
154 | 154 | alert: 'OpenSearchHighNodeMemoryUsage', |
155 | 155 | expr: ||| |
156 | 156 | sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsCriticalMemoryUsage)s |
157 | | - ||| % $._config, |
| 157 | + ||| % this.config, |
158 | 158 | 'for': '5m', |
159 | 159 | labels: { |
160 | 160 | severity: 'critical', |
|
163 | 163 | summary: 'The node memory usage has exceeded the critical threshold.', |
164 | 164 | description: ||| |
165 | 165 | {{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsCriticalMemoryUsage)s. |
166 | | - ||| % $._config, |
| 166 | + ||| % this.config, |
167 | 167 | }, |
168 | 168 | }, |
169 | 169 | { |
170 | 170 | alert: 'OpenSearchModerateRequestLatency', |
171 | 171 | expr: ||| |
172 | | - sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{%(filteringSelector)s, context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s |
173 | | - ||| % $._config, |
| 172 | + sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{context="total", %(filteringSelector)s}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s / 1000 |
| 173 | + ||| % this.config, |
174 | 174 | 'for': '5m', |
175 | 175 | labels: { |
176 | 176 | severity: 'warning', |
177 | 177 | }, |
178 | 178 | annotations: { |
179 | 179 | summary: 'The request latency has exceeded the warning threshold.', |
180 | 180 | description: ||| |
181 | | - {{$labels.index}} has had {{ printf "%%.0f" $value }}s of request latency over the last 5m which is above the threshold of %(alertsWarningRequestLatency)s. |
182 | | - ||| % $._config, |
| 181 | + {{$labels.index}} has had {{ printf "%%.0f" $value }}s of request latency over the last 5m which is above the threshold of %(alertsWarningRequestLatency)sms. |
| 182 | + ||| % this.config, |
183 | 183 | }, |
184 | 184 | }, |
185 | 185 | { |
186 | 186 | alert: 'OpenSearchModerateIndexLatency', |
187 | 187 | expr: ||| |
188 | | - sum without(context) (increase(opensearch_index_indexing_index_time_seconds{%(filteringSelector)s, context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s |
189 | | - ||| % $._config, |
| 188 | + sum without(context) (increase(opensearch_index_indexing_index_time_seconds{context="total", %(filteringSelector)s}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s / 1000 |
| 189 | + ||| % this.config, |
190 | 190 | 'for': '5m', |
191 | 191 | labels: { |
192 | 192 | severity: 'warning', |
193 | 193 | }, |
194 | 194 | annotations: { |
195 | 195 | summary: 'The index latency has exceeded the warning threshold.', |
196 | 196 | description: ||| |
197 | | - {{$labels.index}} has had {{ printf "%%.0f" $value }}s of index latency over the last 5m which is above the threshold of %(alertsWarningIndexLatency)s. |
198 | | - ||| % $._config, |
| 197 | + {{$labels.index}} has had {{ printf "%%.0f" $value }}s of index latency over the last 5m which is above the threshold of %(alertsWarningIndexLatency)sms. |
| 198 | + ||| % this.config, |
199 | 199 | }, |
200 | 200 | }, |
201 | 201 | ], |
|
0 commit comments