Skip to content

Commit 56dec8d

Browse files
committed
fix(prometheus): add _safe_labels helper to propagate exclude_labels to observation sites
When prometheus_exclude_labels strips labels at registration time, observation call sites that still pass the full positional/keyword label set raise ValueError. Add _safe_labels(metric, **kwargs), which filters kwargs to the metric's registered _labelnames. Apply it to:
- the guardrail observation calls (litellm_guardrail_latency_metric, litellm_guardrail_requests_total, litellm_guardrail_errors_total)
- litellm_remaining_api_key_requests_for_model and litellm_remaining_api_key_tokens_for_model (converted from positional to keyword labels)
- litellm_deployment_cooled_down (converted from positional to keyword labels)
- litellm_llm_api_failed_requests_metric (converted from positional to keyword labels)
1 parent 7d3cad1 commit 56dec8d

File tree

1 file changed

+68
-33
lines changed

1 file changed

+68
-33
lines changed

litellm/integrations/prometheus.py

Lines changed: 68 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,20 @@ def get_labels_for_metric(
923923

924924
return labels
925925

926+
def _safe_labels(self, metric, **kwargs) -> dict:
    """Filter label kwargs down to the names registered on ``metric``.

    ``prometheus_exclude_labels`` can strip labels when a metric is
    registered. prometheus_client raises ValueError when an observation
    supplies a label set that does not match the registered one, so call
    sites route their full keyword label set through this helper and
    forward only the surviving names to ``metric.labels(...)``.

    Args:
        metric: The metric about to be observed.
        **kwargs: Candidate label name/value pairs for the observation.

    Returns:
        ``kwargs`` unchanged when ``metric`` is a ``NoOpMetric`` or exposes
        no ``_labelnames`` (attribute missing or ``None``); otherwise a new
        dict containing only the pairs whose name is in ``_labelnames``.
    """
    # NoOpMetric is never inspected for _labelnames; every other metric is
    # asked for the label names it carries.
    allowed = (
        None
        if isinstance(metric, NoOpMetric)
        else getattr(metric, "_labelnames", None)
    )
    if allowed is None:
        return kwargs
    return {name: value for name, value in kwargs.items() if name in allowed}
939+
926940
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
927941
# Define prometheus client
928942
from litellm.types.utils import StandardLoggingPayload
@@ -1326,17 +1340,23 @@ def _set_virtual_key_rate_limit_metrics(
13261340
)
13271341

13281342
self.litellm_remaining_api_key_requests_for_model.labels(
1329-
_sanitize_prometheus_label_value(user_api_key),
1330-
_sanitize_prometheus_label_value(user_api_key_alias),
1331-
_sanitize_prometheus_label_value(model_group),
1332-
_sanitize_prometheus_label_value(model_id),
1343+
**self._safe_labels(
1344+
self.litellm_remaining_api_key_requests_for_model,
1345+
hashed_api_key=_sanitize_prometheus_label_value(user_api_key),
1346+
api_key_alias=_sanitize_prometheus_label_value(user_api_key_alias),
1347+
model=_sanitize_prometheus_label_value(model_group),
1348+
model_id=_sanitize_prometheus_label_value(model_id),
1349+
)
13331350
).set(remaining_requests)
13341351

13351352
self.litellm_remaining_api_key_tokens_for_model.labels(
1336-
_sanitize_prometheus_label_value(user_api_key),
1337-
_sanitize_prometheus_label_value(user_api_key_alias),
1338-
_sanitize_prometheus_label_value(model_group),
1339-
_sanitize_prometheus_label_value(model_id),
1353+
**self._safe_labels(
1354+
self.litellm_remaining_api_key_tokens_for_model,
1355+
hashed_api_key=_sanitize_prometheus_label_value(user_api_key),
1356+
api_key_alias=_sanitize_prometheus_label_value(user_api_key_alias),
1357+
model=_sanitize_prometheus_label_value(model_group),
1358+
model_id=_sanitize_prometheus_label_value(model_id),
1359+
)
13401360
).set(remaining_tokens)
13411361

13421362
def _set_latency_metrics(
@@ -1457,16 +1477,19 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti
14571477

14581478
try:
14591479
self.litellm_llm_api_failed_requests_metric.labels(
1460-
_sanitize_prometheus_label_value(end_user_id),
1461-
_sanitize_prometheus_label_value(user_api_key),
1462-
_sanitize_prometheus_label_value(user_api_key_alias),
1463-
_sanitize_prometheus_label_value(model),
1464-
_sanitize_prometheus_label_value(user_api_team),
1465-
_sanitize_prometheus_label_value(user_api_team_alias),
1466-
_sanitize_prometheus_label_value(user_id),
1467-
_sanitize_prometheus_label_value(
1468-
standard_logging_payload.get("model_id", "")
1469-
),
1480+
**self._safe_labels(
1481+
self.litellm_llm_api_failed_requests_metric,
1482+
end_user=_sanitize_prometheus_label_value(end_user_id),
1483+
hashed_api_key=_sanitize_prometheus_label_value(user_api_key),
1484+
api_key_alias=_sanitize_prometheus_label_value(user_api_key_alias),
1485+
model=_sanitize_prometheus_label_value(model),
1486+
team=_sanitize_prometheus_label_value(user_api_team),
1487+
team_alias=_sanitize_prometheus_label_value(user_api_team_alias),
1488+
user=_sanitize_prometheus_label_value(user_id),
1489+
model_id=_sanitize_prometheus_label_value(
1490+
standard_logging_payload.get("model_id", "")
1491+
),
1492+
)
14701493
).inc()
14711494
self.set_llm_deployment_failure_metrics(kwargs)
14721495
except Exception as e:
@@ -2180,25 +2203,34 @@ def _record_guardrail_metrics(
21802203
try:
21812204
# Record latency
21822205
self.litellm_guardrail_latency_metric.labels(
2183-
guardrail_name=guardrail_name,
2184-
status=status,
2185-
error_type=error_type or "none",
2186-
hook_type=hook_type,
2206+
**self._safe_labels(
2207+
self.litellm_guardrail_latency_metric,
2208+
guardrail_name=guardrail_name,
2209+
status=status,
2210+
error_type=error_type or "none",
2211+
hook_type=hook_type,
2212+
)
21872213
).observe(latency_seconds)
21882214

21892215
# Record request count
21902216
self.litellm_guardrail_requests_total.labels(
2191-
guardrail_name=guardrail_name,
2192-
status=status,
2193-
hook_type=hook_type,
2217+
**self._safe_labels(
2218+
self.litellm_guardrail_requests_total,
2219+
guardrail_name=guardrail_name,
2220+
status=status,
2221+
hook_type=hook_type,
2222+
)
21942223
).inc()
21952224

21962225
# Record error count if there was an error
21972226
if status == "error" and error_type:
21982227
self.litellm_guardrail_errors_total.labels(
2199-
guardrail_name=guardrail_name,
2200-
error_type=error_type,
2201-
hook_type=hook_type,
2228+
**self._safe_labels(
2229+
self.litellm_guardrail_errors_total,
2230+
guardrail_name=guardrail_name,
2231+
error_type=error_type,
2232+
hook_type=hook_type,
2233+
)
22022234
).inc()
22032235
except Exception as e:
22042236
verbose_logger.debug(f"Error recording guardrail metrics: {str(e)}")
@@ -2382,11 +2414,14 @@ def increment_deployment_cooled_down(
23822414
increment metric when litellm.Router / load balancing logic places a deployment in cool down
23832415
"""
23842416
self.litellm_deployment_cooled_down.labels(
2385-
_sanitize_prometheus_label_value(litellm_model_name),
2386-
_sanitize_prometheus_label_value(model_id),
2387-
_sanitize_prometheus_label_value(api_base),
2388-
_sanitize_prometheus_label_value(api_provider),
2389-
_sanitize_prometheus_label_value(exception_status),
2417+
**self._safe_labels(
2418+
self.litellm_deployment_cooled_down,
2419+
litellm_model_name=_sanitize_prometheus_label_value(litellm_model_name),
2420+
model_id=_sanitize_prometheus_label_value(model_id),
2421+
api_base=_sanitize_prometheus_label_value(api_base),
2422+
api_provider=_sanitize_prometheus_label_value(api_provider),
2423+
exception_status=_sanitize_prometheus_label_value(exception_status),
2424+
)
23902425
).inc()
23912426

23922427
def increment_callback_logging_failure(

0 commit comments

Comments
 (0)