Skip to content

Commit ff6e099

Browse files
GCP quota alerts via pulumi (#4296)
* refactor alertStrategy * add quotas alert policy * off by default in base.yaml * remove 0 quotas * work around google API limitation googleapi: Error 400: Alert policies with "prometheus_query_language" condition type can only have a single condition.: provider=google-beta@9.10.0 * fix aligner Field aggregation.perSeriesAligner had an invalid value of "ALIGN_SUM": The aligner cannot be applied to metrics with kind GAUGE and value type BOOL. * threshold variable; add cluster label * type-check baseArgs --------- Signed-off-by: Stephen Compall <stephen.compall@digitalasset.com>
1 parent 8d79788 commit ff6e099

File tree

9 files changed

+125
-36
lines changed

9 files changed

+125
-36
lines changed

cluster/configs/shared/base.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ monitoring:
124124
sequencerRateLimits:
125125
rejectionRateThreshold: 0
126126
circuitBreakerStateThreshold: 0.5
127+
gcpQuotas:
128+
enabled: false
127129
# Alert on secrets appearing in logs (JWTs, Bearer tokens, passwords, etc.)
128130
# Key=value secrets (excluding masked values)
129131
# JWTs identified by base64 header prefix "eyJhbGc" (decodes to '{"alg')

cluster/deployment/scratchneta/config.resolved.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ monitoring:
5252
thresholdPerNamespace: 0.05
5353
deployment:
5454
pendingPeriodMinutes: 5
55+
gcpQuotas:
56+
enabled: false
5557
ingestion:
5658
thresholdEntriesPerBatch: 80
5759
loadTester:

cluster/deployment/scratchnetb/config.resolved.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ monitoring:
5252
thresholdPerNamespace: 0.05
5353
deployment:
5454
pendingPeriodMinutes: 5
55+
gcpQuotas:
56+
enabled: false
5557
ingestion:
5658
thresholdEntriesPerBatch: 80
5759
loadTester:

cluster/deployment/scratchnetc/config.resolved.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ monitoring:
5252
thresholdPerNamespace: 0.05
5353
deployment:
5454
pendingPeriodMinutes: 5
55+
gcpQuotas:
56+
enabled: false
5557
ingestion:
5658
thresholdEntriesPerBatch: 80
5759
loadTester:

cluster/deployment/scratchnetd/config.resolved.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ monitoring:
5252
thresholdPerNamespace: 0.05
5353
deployment:
5454
pendingPeriodMinutes: 5
55+
gcpQuotas:
56+
enabled: false
5557
ingestion:
5658
thresholdEntriesPerBatch: 80
5759
loadTester:

cluster/deployment/scratchnete/config.resolved.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ monitoring:
5252
thresholdPerNamespace: 0.05
5353
deployment:
5454
pendingPeriodMinutes: 5
55+
gcpQuotas:
56+
enabled: false
5557
ingestion:
5658
thresholdEntriesPerBatch: 80
5759
loadTester:

cluster/pulumi/infra/src/config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ const MonitoringConfigSchema = z
7676
rejectionRateThreshold: z.number(),
7777
circuitBreakerStateThreshold: z.number(),
7878
}),
79+
gcpQuotas: z.object({
80+
enabled: z.boolean(),
81+
}),
7982
}),
8083
logAlerts: z.object({}).catchall(z.string()).default({}),
8184
loggedSecretsFilter: z.string().optional(),

cluster/pulumi/infra/src/gcpAlerts.ts

Lines changed: 106 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@ export function getNotificationChannel(
3838
: undefined;
3939
}
4040

41+
function getAlertStrategy(notificationChannel: gcp.monitoring.NotificationChannel) {
42+
return {
43+
autoClose: '3600s',
44+
notificationChannelStrategies: [
45+
{
46+
notificationChannelNames: [notificationChannel.name],
47+
renotifyInterval: `${4 * 60 * 60}s`, // 4 hours
48+
},
49+
],
50+
};
51+
}
52+
4153
export function installGcpLoggingAlerts(
4254
notificationChannel: gcp.monitoring.NotificationChannel
4355
): void {
@@ -81,15 +93,7 @@ ${Object.keys(logAlerts)
8193
const alertCount = enableChaosMesh ? 50 : 1;
8294
const displayName = `Log warnings and errors > ${alertCount} ${CLUSTER_BASENAME}`;
8395
new gcp.monitoring.AlertPolicy('logsAlert', {
84-
alertStrategy: {
85-
autoClose: '3600s',
86-
notificationChannelStrategies: [
87-
{
88-
notificationChannelNames: [notificationChannel.name],
89-
renotifyInterval: `${4 * 60 * 60}s`, // 4 hours
90-
},
91-
],
92-
},
96+
alertStrategy: getAlertStrategy(notificationChannel),
9397
combiner: 'OR',
9498
conditions: [
9599
{
@@ -156,15 +160,7 @@ ${ensureTrailingNewline(loggedSecretsFilter)}`;
156160

157161
const displayName = `Logged secrets detected in ${CLUSTER_BASENAME}`;
158162
new gcp.monitoring.AlertPolicy('loggedSecretsAlert', {
159-
alertStrategy: {
160-
autoClose: '3600s',
161-
notificationChannelStrategies: [
162-
{
163-
notificationChannelNames: [notificationChannel.name],
164-
renotifyInterval: `${4 * 60 * 60}s`, // 4 hours
165-
},
166-
],
167-
},
163+
alertStrategy: getAlertStrategy(notificationChannel),
168164
combiner: 'OR',
169165
conditions: [
170166
{
@@ -222,15 +218,7 @@ jsonPayload.state=~"STARTED"`,
222218

223219
const displayName = `Cluster ${CLUSTER_BASENAME} is being updated`;
224220
new gcp.monitoring.AlertPolicy('updateClusterAlert', {
225-
alertStrategy: {
226-
autoClose: '3600s',
227-
notificationChannelStrategies: [
228-
{
229-
notificationChannelNames: [notificationChannel.name],
230-
renotifyInterval: `${4 * 60 * 60}s`, // 4 hours
231-
},
232-
],
233-
},
221+
alertStrategy: getAlertStrategy(notificationChannel),
234222
combiner: 'OR',
235223
conditions: [
236224
{
@@ -273,15 +261,7 @@ resource.type="cloudsql_database"
273261

274262
const displayName = `Possible CloudSQL maintenance going on in ${CLUSTER_BASENAME}`;
275263
new gcp.monitoring.AlertPolicy('updateCloudSQLAlert', {
276-
alertStrategy: {
277-
autoClose: '3600s',
278-
notificationChannelStrategies: [
279-
{
280-
notificationChannelNames: [notificationChannel.name],
281-
renotifyInterval: `${4 * 60 * 60}s`, // 4 hours
282-
},
283-
],
284-
},
264+
alertStrategy: getAlertStrategy(notificationChannel),
285265
combiner: 'OR',
286266
conditions: [
287267
{
@@ -309,3 +289,93 @@ resource.type="cloudsql_database"
309289
notificationChannels: [notificationChannel.name],
310290
});
311291
}
292+
293+
/**
 * Installs GCP consumer-quota alert policies for this cluster:
 *  1. a hard "quota exceeded" alert (requests are being rejected right now), and
 *  2/3. early-warning alerts when allocation or rate quota usage crosses a
 *  fraction of its limit.
 *
 * The two usage alerts are separate policies because the GCP API rejects a
 * policy holding more than one "prometheus_query_language" condition
 * (see commit message: googleapi Error 400).
 *
 * @param notificationChannel channel that receives all three alerts
 */
export function installGcpQuotaAlerts(
  notificationChannel: gcp.monitoring.NotificationChannel
): void {
  // Alert when usage exceeds this fraction of the quota limit.
  const quotaUsageThreshold = 0.9;
  // Human-readable form of the threshold, used in display names.
  const quotaUsageThresholdPercent = quotaUsageThreshold * 100;

  // Arguments common to all three policies; Pick<> keeps this object
  // type-checked against AlertPolicyArgs (per commit: "type-check baseArgs").
  const baseArgs: Pick<
    gcp.monitoring.AlertPolicyArgs,
    'alertStrategy' | 'combiner' | 'notificationChannels' | 'userLabels'
  > = {
    alertStrategy: getAlertStrategy(notificationChannel),
    combiner: 'OR',
    notificationChannels: [notificationChannel.name],
    // Label each policy with the cluster so alerts are attributable.
    userLabels: { cluster: CLUSTER_BASENAME },
  };

  new gcp.monitoring.AlertPolicy('quotaExceededAlert', {
    ...baseArgs,
    displayName: `Quota Exceeded in ${CLUSTER_BASENAME}`,
    conditions: [
      {
        // "Quota Full" (Exceeded right now)
        displayName: `Quota Exceeded in ${CLUSTER_BASENAME}`,
        conditionThreshold: {
          aggregations: [
            {
              alignmentPeriod: '60s',
              crossSeriesReducer: 'REDUCE_SUM',
              groupByFields: ['metric.label.quota_metric'],
              // Count TRUE samples of the boolean "exceeded" gauge; SUM-style
              // aligners are invalid for GAUGE/BOOL metrics (per commit
              // message: "fix aligner").
              perSeriesAligner: 'ALIGN_COUNT_TRUE',
            },
          ],
          comparison: 'COMPARISON_GT',
          duration: '60s',
          filter:
            'resource.type="consumer_quota" AND metric.type="serviceruntime.googleapis.com/quota/exceeded"',
          trigger: {
            // Fire as soon as any single series breaches the threshold.
            count: 1,
          },
        },
      },
    ],
  });

  // Allocation quotas track consumed capacity (for example CPUs, IPs, disk)
  // against fixed limits, while rate quotas track request throughput over time
  // windows (for example API calls per minute). These are tracked separately so
  // we have separate alerts for them

  new gcp.monitoring.AlertPolicy('quotaAllocationAlert', {
    ...baseArgs,
    displayName: `Allocation Quota approaching limit (>${quotaUsageThresholdPercent}%) in ${CLUSTER_BASENAME}`,
    conditions: [
      {
        // Tracks resources like CPUs, Static IPs, Disk Space
        displayName: `Allocation Quota approaching limit (>${quotaUsageThresholdPercent}%) in ${CLUSTER_BASENAME}`,
        conditionPrometheusQueryLanguage: {
          // usage / limit > threshold, joining on everything except
          // limit_name; the "> 0" filter drops zero limits so the division
          // cannot blow up (per commit message: "remove 0 quotas").
          query: `
serviceruntime_googleapis_com:quota_allocation_usage{monitored_resource="consumer_quota"}
  / ignoring(limit_name) group_right()
(serviceruntime_googleapis_com:quota_limit{monitored_resource="consumer_quota"} > 0)
  > ${quotaUsageThreshold}
`,
          // Must stay above threshold for 5 minutes before firing.
          duration: '300s',
        },
      },
    ],
  });

  new gcp.monitoring.AlertPolicy('quotaRateAlert', {
    ...baseArgs,
    displayName: `Rate Quota approaching limit (>${quotaUsageThresholdPercent}%) in ${CLUSTER_BASENAME}`,
    conditions: [
      {
        // Tracks API requests, HSM operations per minute, etc.
        displayName: `Rate Quota approaching limit (>${quotaUsageThresholdPercent}%) in ${CLUSTER_BASENAME}`,
        conditionPrometheusQueryLanguage: {
          // Same shape as the allocation alert, but against net rate usage.
          query: `
serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}
  / ignoring(limit_name) group_right()
(serviceruntime_googleapis_com:quota_limit{monitored_resource="consumer_quota"} > 0)
  > ${quotaUsageThreshold}
`,
          duration: '300s',
        },
      },
    ],
  });
}

cluster/pulumi/infra/src/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
installGcpLoggingAlerts,
2323
installClusterMaintenanceUpdateAlerts,
2424
installLoggedSecretsAlerts,
25+
installGcpQuotaAlerts,
2526
} from './gcpAlerts';
2627
import { configureGKEL7Gateway } from './gcpLoadBalancer';
2728
import { configureIstio, istioMonitoring } from './istio';
@@ -82,6 +83,9 @@ if (enableAlerts && !clusterIsResetPeriodically) {
8283
if (monitoringConfig.alerting.loggedSecretsFilter) {
8384
installLoggedSecretsAlerts(notificationChannel);
8485
}
86+
if (monitoringConfig.alerting.alerts.gcpQuotas.enabled) {
87+
installGcpQuotaAlerts(notificationChannel);
88+
}
8589
}
8690
}
8791
istioMonitoring(network.ingressNs, []);

0 commit comments

Comments (0)