
Commit 19123ca

refactor: standardize InferenceRouter model handling (#2965)
1 parent: 8031141

File tree

4 files changed: +28, -38 lines

llama_stack/apis/common/errors.py

Lines changed: 10 additions & 0 deletions

@@ -62,3 +62,13 @@ class SessionNotFoundError(ValueError):
     def __init__(self, session_name: str) -> None:
         message = f"Session '{session_name}' not found or access denied."
         super().__init__(message)
+
+
+class ModelTypeError(TypeError):
+    """raised when a model is present but not the correct type"""
+
+    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
+        message = (
+            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
+        )
+        super().__init__(message)
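
For illustration only (not part of the commit), a minimal sketch of how the new exception behaves; the model identifier is made up:

    from llama_stack.apis.common.errors import ModelTypeError

    try:
        raise ModelTypeError("my-embed-model", "embedding", "llm")
    except TypeError as exc:  # ModelTypeError subclasses TypeError, so this catch works
        print(exc)
        # -> Model 'my-embed-model' is of type 'embedding' rather than the expected type 'llm'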

llama_stack/core/routers/inference.py

Lines changed: 16 additions & 33 deletions

@@ -18,7 +18,7 @@
     InterleavedContent,
     InterleavedContentItem,
 )
-from llama_stack.apis.common.errors import ModelNotFoundError
+from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
 from llama_stack.apis.inference import (
     BatchChatCompletionResponse,
     BatchCompletionResponse,

@@ -177,6 +177,15 @@ async def _count_tokens(
         encoded = self.formatter.encode_content(messages)
         return len(encoded.tokens) if encoded and encoded.tokens else 0

+    async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
+        """takes a model id and gets model after ensuring that it is accessible and of the correct type"""
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type != expected_model_type:
+            raise ModelTypeError(model_id, model.model_type, expected_model_type)
+        return model
+
     async def chat_completion(
         self,
         model_id: str,
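
A hedged sketch of what the new helper does at call sites; `router` stands in for an already-initialized InferenceRouter, and the model ids are hypothetical:

    from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
    from llama_stack.apis.models import ModelType

    async def demo(router):
        # Resolves and returns the Model when the id exists and the type matches.
        model = await router._get_model("my-embed-model", ModelType.embedding)
        try:
            # Same id requested as an LLM: rejected with the structured error.
            await router._get_model("my-embed-model", ModelType.llm)
        except ModelTypeError as err:
            print(err)
        try:
            # Unknown or inaccessible id: still the familiar not-found error.
            await router._get_model("no-such-model", ModelType.llm)
        except ModelNotFoundError as err:
            print(err)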
@@ -195,11 +204,7 @@ async def chat_completion(
         )
         if sampling_params is None:
             sampling_params = SamplingParams()
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
+        model = await self._get_model(model_id, ModelType.llm)
         if tool_config:
             if tool_choice and tool_choice != tool_config.tool_choice:
                 raise ValueError("tool_choice and tool_config.tool_choice must match")

@@ -301,11 +306,7 @@ async def completion(
         logger.debug(
             f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
         )
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
+        model = await self._get_model(model_id, ModelType.llm)
         provider = await self.routing_table.get_provider_impl(model_id)
         params = dict(
             model_id=model_id,

@@ -355,11 +356,7 @@ async def embeddings(
         task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         logger.debug(f"InferenceRouter.embeddings: {model_id}")
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.llm:
-            raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
+        await self._get_model(model_id, ModelType.embedding)
         provider = await self.routing_table.get_provider_impl(model_id)
         return await provider.embeddings(
             model_id=model_id,

@@ -395,12 +392,7 @@ async def openai_completion(
         logger.debug(
             f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
-
+        model_obj = await self._get_model(model, ModelType.llm)
         params = dict(
             model=model_obj.identifier,
             prompt=prompt,

@@ -476,11 +468,7 @@ async def openai_chat_completion(
         logger.debug(
             f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
+        model_obj = await self._get_model(model, ModelType.llm)

         # Use the OpenAI client for a bit of extra input validation without
         # exposing the OpenAI client itself as part of our API surface

@@ -567,12 +555,7 @@ async def openai_embeddings(
         logger.debug(
             f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type != ModelType.embedding:
-            raise ValueError(f"Model '{model}' is not an embedding model")
-
+        model_obj = await self._get_model(model, ModelType.embedding)
         params = dict(
             model=model_obj.identifier,
             input=input,
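
One observable effect of the consolidation: a type mismatch that previously raised a plain ValueError now raises ModelTypeError, a TypeError subclass. A hedged pytest-style sketch of the new failure mode; the `router` fixture and model id are assumptions:

    import pytest
    from llama_stack.apis.common.errors import ModelTypeError

    async def test_chat_completion_rejects_embedding_model(router):
        # Assumes an async-aware pytest setup (e.g. pytest-asyncio);
        # "my-embed-model" stands in for any registered embedding model.
        with pytest.raises(ModelTypeError):
            await router.chat_completion(model_id="my-embed-model", messages=[])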

llama_stack/core/routing_tables/vector_dbs.py

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@

 from pydantic import TypeAdapter

-from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError
+from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError, VectorStoreNotFoundError
 from llama_stack.apis.models import ModelType
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs

@@ -66,7 +66,7 @@ async def register_vector_db(
         if model is None:
             raise ModelNotFoundError(embedding_model)
         if model.model_type != ModelType.embedding:
-            raise ValueError(f"Model {embedding_model} is not an embedding model")
+            raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
         if "embedding_dimension" not in model.metadata:
             raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
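
Vector DB registration now surfaces the same structured error. A sketch of what a caller might observe when the configured embedding model is actually an LLM; the identifiers and the exact call shape are assumptions:

    from llama_stack.apis.common.errors import ModelTypeError

    async def register(routing_table):
        try:
            # "my-llm" is hypothetical: an LLM id passed where an embedding model is required.
            await routing_table.register_vector_db(
                vector_db_id="my-docs",
                embedding_model="my-llm",
            )
        except ModelTypeError as err:
            print(err)  # rejected with the structured type error instead of a bare ValueError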

llama_stack/providers/remote/inference/ollama/ollama.py

Lines changed: 0 additions & 3 deletions

@@ -457,9 +457,6 @@ async def openai_embeddings(
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
         model_obj = await self._get_model(model)
-        if model_obj.model_type != ModelType.embedding:
-            raise ValueError(f"Model {model} is not an embedding model")
-
         if model_obj.provider_resource_id is None:
             raise ValueError(f"Model {model} has no provider_resource_id set")