Commit f6c3e91

feat: implement graceful model discovery for vLLM provider
- Attempt model discovery first for backward compatibility
- If discovery fails and refresh_models=false, continue without error
- If discovery fails and refresh_models=true, fail hard with ValueError
- Supports dynamic token authentication scenarios

Fixes OAuth authentication issues when vLLM service requires dynamic tokens
1 parent 0a41c4e commit f6c3e91
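
To make the three outcomes above concrete, here is a minimal sketch of the discovery policy in isolation. The names check_model, cfg, and fetch_model_ids are hypothetical stand-ins for the adapter's method, its provider config, and the live /v1/models listing; this illustrates the policy, not the adapter's exact code.

import logging

log = logging.getLogger(__name__)


async def check_model(model_id: str, cfg, fetch_model_ids) -> None:
    # Sketch only: `cfg.refresh_models` mirrors the provider config flag,
    # `fetch_model_ids` stands in for the live /v1/models listing.
    try:
        available = await fetch_model_ids()
        if model_id not in available:
            raise ValueError(f"Model {model_id} is not being served by vLLM.")
    except Exception as e:
        if cfg.refresh_models:
            # refresh_models=true: discovery is mandatory, so fail hard.
            raise ValueError(f"Model verification failed: {e}") from e
        # refresh_models=false: tolerate a failed live check, e.g. when the
        # endpoint only issues dynamic OAuth tokens at request time.
        log.warning(f"Model verification failed: {e}. Continuing without live check.")

One consequence worth noting: with refresh_models=false, even a model that is absent from a successful live listing only produces a warning, because the broad except Exception in the diff below also catches the ValueError raised by the membership check.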

File tree

1 file changed: +21 −8 lines changed

llama_stack/providers/remote/inference/vllm/vllm.py

Lines changed: 21 additions & 8 deletions

@@ -430,16 +430,29 @@ async def register_model(self, model: Model) -> Model:
             pass  # Ignore statically unknown model, will check live listing
         try:
             res = self.client.models.list()
+            available_models = [m.id async for m in res]
+            if model.provider_resource_id not in available_models:
+                raise ValueError(
+                    f"Model {model.provider_resource_id} is not being served by vLLM. "
+                    f"Available models: {', '.join(available_models)}"
+                )
         except APIConnectionError as e:
-            raise ValueError(
-                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
-            ) from e
-        available_models = [m.id async for m in res]
-        if model.provider_resource_id not in available_models:
-            raise ValueError(
-                f"Model {model.provider_resource_id} is not being served by vLLM. "
-                f"Available models: {', '.join(available_models)}"
+            if self.config.refresh_models:
+                raise ValueError(
+                    f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
+                ) from e
+            # Otherwise, gracefully continue without verification
+            log.warning(
+                f"Failed to connect to vLLM at {self.config.url} for model verification. Continuing without live check (refresh_models=false)."
             )
+        except Exception as e:
+            if self.config.refresh_models:
+                raise ValueError(f"Model verification failed: {e}") from e
+            # Otherwise, gracefully continue without verification
+            log.warning(
+                f"Model verification failed for {model.provider_resource_id}: {e}. Continuing without live check (refresh_models=false)."
+            )
+
         return model
 
     async def _get_params(self, request: ChatCompletionRequest) -> dict:
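
Both configurations can be exercised against the policy sketch above; the following is a minimal, self-contained demonstration, where failing_listing simulates a vLLM endpoint that rejects unauthenticated discovery:

import asyncio
from types import SimpleNamespace


async def failing_listing() -> list[str]:
    # Simulates discovery failing, e.g. because the endpoint requires a
    # dynamically issued OAuth token that is not available at startup.
    raise ConnectionError("401 Unauthorized: dynamic token required")


async def main() -> None:
    # refresh_models=false: the failure is logged and registration proceeds.
    await check_model("my-model", SimpleNamespace(refresh_models=False), failing_listing)

    # refresh_models=true: the same failure becomes a hard ValueError.
    try:
        await check_model("my-model", SimpleNamespace(refresh_models=True), failing_listing)
    except ValueError as e:
        print(f"Hard failure, as expected: {e}")


asyncio.run(main())

A test against the real adapter would follow the same shape: stub client.models.list to raise, then assert that register_model returns the model when refresh_models=false and raises ValueError when refresh_models=true.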
