@@ -38,6 +38,18 @@ export function getNotificationChannel(
3838 : undefined ;
3939}
4040
/**
 * Builds the alert strategy shared by the alert policies in this file.
 *
 * The strategy auto-closes an incident after one hour and re-sends the
 * notification to the given channel every four hours.
 *
 * @param notificationChannel - Channel whose `name` is registered as the
 *   re-notification target.
 * @returns An object suitable for the `alertStrategy` field of
 *   `gcp.monitoring.AlertPolicy` arguments.
 */
function getAlertStrategy(notificationChannel: gcp.monitoring.NotificationChannel) {
  const autoCloseSeconds = 60 * 60; // 1 hour
  const renotifySeconds = 4 * 60 * 60; // 4 hours

  return {
    // Durations are written as "<seconds>s" with no whitespace before the unit.
    autoClose: `${autoCloseSeconds}s`,
    notificationChannelStrategies: [
      {
        notificationChannelNames: [notificationChannel.name],
        renotifyInterval: `${renotifySeconds}s`,
      },
    ],
  };
}
52+
4153export function installGcpLoggingAlerts (
4254 notificationChannel : gcp . monitoring . NotificationChannel
4355) : void {
@@ -81,15 +93,7 @@ ${Object.keys(logAlerts)
8193 const alertCount = enableChaosMesh ? 50 : 1 ;
8294 const displayName = `Log warnings and errors > ${ alertCount } ${ CLUSTER_BASENAME } ` ;
8395 new gcp . monitoring . AlertPolicy ( 'logsAlert' , {
84- alertStrategy : {
85- autoClose : '3600s' ,
86- notificationChannelStrategies : [
87- {
88- notificationChannelNames : [ notificationChannel . name ] ,
89- renotifyInterval : `${ 4 * 60 * 60 } s` , // 4 hours
90- } ,
91- ] ,
92- } ,
96+ alertStrategy : getAlertStrategy ( notificationChannel ) ,
9397 combiner : 'OR' ,
9498 conditions : [
9599 {
@@ -156,15 +160,7 @@ ${ensureTrailingNewline(loggedSecretsFilter)}`;
156160
157161 const displayName = `Logged secrets detected in ${ CLUSTER_BASENAME } ` ;
158162 new gcp . monitoring . AlertPolicy ( 'loggedSecretsAlert' , {
159- alertStrategy : {
160- autoClose : '3600s' ,
161- notificationChannelStrategies : [
162- {
163- notificationChannelNames : [ notificationChannel . name ] ,
164- renotifyInterval : `${ 4 * 60 * 60 } s` , // 4 hours
165- } ,
166- ] ,
167- } ,
163+ alertStrategy : getAlertStrategy ( notificationChannel ) ,
168164 combiner : 'OR' ,
169165 conditions : [
170166 {
@@ -222,15 +218,7 @@ jsonPayload.state=~"STARTED"`,
222218
223219 const displayName = `Cluster ${ CLUSTER_BASENAME } is being updated` ;
224220 new gcp . monitoring . AlertPolicy ( 'updateClusterAlert' , {
225- alertStrategy : {
226- autoClose : '3600s' ,
227- notificationChannelStrategies : [
228- {
229- notificationChannelNames : [ notificationChannel . name ] ,
230- renotifyInterval : `${ 4 * 60 * 60 } s` , // 4 hours
231- } ,
232- ] ,
233- } ,
221+ alertStrategy : getAlertStrategy ( notificationChannel ) ,
234222 combiner : 'OR' ,
235223 conditions : [
236224 {
@@ -273,15 +261,7 @@ resource.type="cloudsql_database"
273261
274262 const displayName = `Possible CloudSQL maintenance going on in ${ CLUSTER_BASENAME } ` ;
275263 new gcp . monitoring . AlertPolicy ( 'updateCloudSQLAlert' , {
276- alertStrategy : {
277- autoClose : '3600s' ,
278- notificationChannelStrategies : [
279- {
280- notificationChannelNames : [ notificationChannel . name ] ,
281- renotifyInterval : `${ 4 * 60 * 60 } s` , // 4 hours
282- } ,
283- ] ,
284- } ,
264+ alertStrategy : getAlertStrategy ( notificationChannel ) ,
285265 combiner : 'OR' ,
286266 conditions : [
287267 {
@@ -309,3 +289,93 @@ resource.type="cloudsql_database"
309289 notificationChannels : [ notificationChannel . name ] ,
310290 } ) ;
311291}
292+
/**
 * Installs three GCP Monitoring alert policies covering service quotas:
 *
 * - `quotaExceededAlert`: fires on the `serviceruntime.googleapis.com/quota/exceeded`
 *   metric for `consumer_quota` resources.
 * - `quotaAllocationAlert`: fires when allocation quota usage divided by its
 *   limit stays above the usage threshold.
 * - `quotaRateAlert`: fires when rate quota usage divided by its limit stays
 *   above the usage threshold.
 *
 * All policies share the same alert strategy, combiner, notification channel
 * and `cluster` user label.
 *
 * @param notificationChannel - Channel that receives notifications for all
 *   three policies.
 */
export function installGcpQuotaAlerts(
  notificationChannel: gcp.monitoring.NotificationChannel
): void {
  const quotaUsageThreshold = 0.9;
  const quotaUsageThresholdPercent = quotaUsageThreshold * 100;

  const baseArgs: Pick<
    gcp.monitoring.AlertPolicyArgs,
    'alertStrategy' | 'combiner' | 'notificationChannels' | 'userLabels'
  > = {
    alertStrategy: getAlertStrategy(notificationChannel),
    combiner: 'OR',
    notificationChannels: [notificationChannel.name],
    userLabels: { cluster: CLUSTER_BASENAME },
  };

  // "Quota Full" (exceeded right now). The policy and its single condition
  // share one display name.
  const quotaExceededName = `Quota Exceeded in ${CLUSTER_BASENAME}`;
  new gcp.monitoring.AlertPolicy('quotaExceededAlert', {
    ...baseArgs,
    displayName: quotaExceededName,
    conditions: [
      {
        displayName: quotaExceededName,
        conditionThreshold: {
          aggregations: [
            {
              alignmentPeriod: '60s',
              crossSeriesReducer: 'REDUCE_SUM',
              groupByFields: ['metric.label.quota_metric'],
              perSeriesAligner: 'ALIGN_COUNT_TRUE',
            },
          ],
          // NOTE(review): no thresholdValue is set, so this presumably compares
          // against the provider default of 0 — confirm.
          comparison: 'COMPARISON_GT',
          duration: '60s',
          filter:
            'resource.type="consumer_quota" AND metric.type="serviceruntime.googleapis.com/quota/exceeded"',
          trigger: {
            count: 1,
          },
        },
      },
    ],
  });

  // Allocation quotas track consumed capacity (for example CPUs, IPs, disk)
  // against fixed limits, while rate quotas track request throughput over time
  // windows (for example API calls per minute). These are tracked separately,
  // so each gets its own alert policy built from the same template.
  const installQuotaUsageAlert = (
    resourceName: string,
    quotaKind: string,
    usageMetric: string
  ): void => {
    const displayName = `${quotaKind} Quota approaching limit (>${quotaUsageThresholdPercent}%) in ${CLUSTER_BASENAME}`;
    // usage / limit, keeping only series with a positive limit, compared
    // against the usage threshold.
    const query = [
      `serviceruntime_googleapis_com:${usageMetric}{monitored_resource="consumer_quota"}`,
      '/ ignoring(limit_name) group_right()',
      '(serviceruntime_googleapis_com:quota_limit{monitored_resource="consumer_quota"} > 0)',
      `> ${quotaUsageThreshold}`,
    ].join('\n');

    new gcp.monitoring.AlertPolicy(resourceName, {
      ...baseArgs,
      displayName,
      conditions: [
        {
          displayName,
          conditionPrometheusQueryLanguage: {
            query,
            duration: '300s',
          },
        },
      ],
    });
  };

  // Tracks resources like CPUs, Static IPs, Disk Space
  installQuotaUsageAlert('quotaAllocationAlert', 'Allocation', 'quota_allocation_usage');
  // Tracks API requests, HSM operations per minute, etc.
  installQuotaUsageAlert('quotaRateAlert', 'Rate', 'quota_rate_net_usage');
}
0 commit comments