diff --git a/packages/chatbot-server-mongodb-public/environments/production.yml b/packages/chatbot-server-mongodb-public/environments/production.yml
index 57fc586d9..ff4529f16 100644
--- a/packages/chatbot-server-mongodb-public/environments/production.yml
+++ b/packages/chatbot-server-mongodb-public/environments/production.yml
@@ -103,3 +103,39 @@ prometheusRules:
     annotations:
       summary: High HTTP 500 Error rate on {$labels.job}
       description: Too many HTTP 500 Errors on {$labels.job} in the last 5 minutes
+  # Warning-level alert on HTTP 429 responses from the docs-chat container.
+  # Threshold is more than 10 responses per 5 minutes, written as a per-second
+  # rate so that $value always matches the "errors/second" wording below.
+  # NOTE(review): confirm these series carry a "service" label; the neighbouring
+  # HTTP 500 rule annotates with $labels.job instead.
+  - alert: HighRateLimitErrors
+    expr: |
+      rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > (10 / 300)
+    for: 2m
+    labels:
+      severity: warning
+      category: performance
+    annotations:
+      summary: "High rate of 429 (rate limit) errors detected"
+      description: |
+        Service {{ $labels.service }} is experiencing high rate limit errors.
+        Current rate: {{ $value | printf "%.2f" }} errors/second
+        This may indicate:
+        - Client retry storms
+        - Insufficient rate limiting configuration
+        - Upstream service throttling
+  # Critical-level counterpart: more than 50 responses with code 429 per 5 minutes,
+  # again written as a per-second rate so $value is always in errors/second.
+  - alert: CriticalRateLimitErrors
+    expr: |
+      rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > (50 / 300)
+    for: 1m
+    labels:
+      severity: critical
+      category: performance
+    annotations:
+      summary: "Critical rate of 429 (rate limit) errors"
+      description: |
+        Service {{ $labels.service }} is experiencing critical rate limit errors.
+        Current rate: {{ $value | printf "%.2f" }} errors/second
+        Immediate investigation required - service may be degraded.
diff --git a/packages/chatbot-server-mongodb-public/environments/staging.yml b/packages/chatbot-server-mongodb-public/environments/staging.yml
index 40c14da88..891dee4b4 100644
--- a/packages/chatbot-server-mongodb-public/environments/staging.yml
+++ b/packages/chatbot-server-mongodb-public/environments/staging.yml
@@ -103,3 +103,39 @@ prometheusRules:
     annotations:
       summary: High HTTP 500 Error rate on {$labels.job}
       description: Too many HTTP 500 Errors on {$labels.job} in the last 5 minutes
+  # Warning-level alert on HTTP 429 responses from the docs-chat container.
+  # Threshold is more than 10 responses per 5 minutes, written as a per-second
+  # rate so that $value always matches the "errors/second" wording below.
+  # NOTE(review): confirm these series carry a "service" label; the neighbouring
+  # HTTP 500 rule annotates with $labels.job instead.
+  - alert: HighRateLimitErrors
+    expr: |
+      rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > (10 / 300)
+    for: 2m
+    labels:
+      severity: warning
+      category: performance
+    annotations:
+      summary: "High rate of 429 (rate limit) errors detected"
+      description: |
+        Service {{ $labels.service }} is experiencing high rate limit errors.
+        Current rate: {{ $value | printf "%.2f" }} errors/second
+        This may indicate:
+        - Client retry storms
+        - Insufficient rate limiting configuration
+        - Upstream service throttling
+  # Critical-level counterpart: more than 50 responses with code 429 per 5 minutes,
+  # again written as a per-second rate so $value is always in errors/second.
+  - alert: CriticalRateLimitErrors
+    expr: |
+      rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > (50 / 300)
+    for: 1m
+    labels:
+      severity: critical
+      category: performance
+    annotations:
+      summary: "Critical rate of 429 (rate limit) errors"
+      description: |
+        Service {{ $labels.service }} is experiencing critical rate limit errors.
+        Current rate: {{ $value | printf "%.2f" }} errors/second
+        Immediate investigation required - service may be degraded.