
Commit 5bf0b04

Merge pull request #152 from cloudera/mob/main
Use the NVidia library for CAII models, and tweak summarization config to account for Mistral's small context window
2 parents cc0bad3 + cf0d7a7 commit 5bf0b04

7 files changed: +49 -106 lines changed

llm-service/app/ai/indexing/summary_indexer.py

Lines changed: 7 additions & 1 deletion
@@ -47,7 +47,7 @@
     DocumentSummaryIndex,
     StorageContext,
     get_response_synthesizer,
-    load_index_from_storage,
+    load_index_from_storage, PromptHelper,
 )
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.base.embeddings.base import BaseEmbedding
@@ -66,6 +66,7 @@
 from .readers.base_reader import ReaderConfig, ChunksResult
 from ..vector_stores.qdrant import QdrantVectorStore
 from ...config import Settings
+from ...services.models import CAIIModelProvider

 logger = logging.getLogger(__name__)

@@ -117,13 +118,18 @@ def __index_configuration(
         data_source_id: int,
         embed_summaries: bool = True,
     ) -> Dict[str, Any]:
+        prompt_helper: Optional[PromptHelper] = None
+        # if we're using CAII, let's be conservative, and use a small context window to account for mistral's small context
+        if CAIIModelProvider.is_enabled():
+            prompt_helper=PromptHelper(context_window=3000)
         return {
             "llm": llm,
             "response_synthesizer": get_response_synthesizer(
                 response_mode=ResponseMode.TREE_SUMMARIZE,
                 llm=llm,
                 use_async=True,
                 verbose=True,
+                prompt_helper=prompt_helper
             ),
             "show_progress": True,
             "embed_model": embedding_model,

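For context, PromptHelper is what the response synthesizer uses to budget how much text goes into each LLM call, so a deliberately small context_window makes TREE_SUMMARIZE split and recurse instead of overflowing a short-context model. A minimal, self-contained sketch of the pattern this hunk wires in (MockLLM stands in for the real CAII/Bedrock model, which is resolved elsewhere in the service):

from llama_index.core import PromptHelper, get_response_synthesizer
from llama_index.core.llms import MockLLM  # stand-in for the CAII-hosted model
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.schema import NodeWithScore, TextNode

llm = MockLLM()

# A conservative window (well under Mistral's limit) forces more aggressive
# chunking before the tree-summarize passes run.
prompt_helper = PromptHelper(context_window=3000)

synthesizer = get_response_synthesizer(
    response_mode=ResponseMode.TREE_SUMMARIZE,
    llm=llm,
    use_async=True,
    verbose=True,
    prompt_helper=prompt_helper,  # None leaves the library defaults in place
)

nodes = [NodeWithScore(node=TextNode(text="Example passage to summarize."), score=1.0)]
print(synthesizer.synthesize("Summarize the passage.", nodes=nodes))

Passing prompt_helper=None for non-CAII providers keeps the default behavior, so only the CAII path is affected by the smaller window.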
llm-service/app/services/caii/CaiiModel.py

Lines changed: 0 additions & 39 deletions
@@ -38,7 +38,6 @@
 from typing import Callable, Dict, Sequence, Any

 from llama_index.core.base.llms.types import ChatMessage, LLMMetadata, ChatResponse, CompletionResponse
-from llama_index.llms.mistralai.base import MistralAI
 from llama_index.llms.openai import OpenAI
 from pydantic import Field

@@ -108,41 +107,3 @@ def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
         content: str = raw_response.message.content or ""
         raw_response.message.content = content.split("</think>")[-1]
         return raw_response
-
-
-class CaiiModelMistral(MistralAI):
-    def __init__(
-        self,
-        model: str,
-        context: int,
-        api_base: str,
-        messages_to_prompt: Callable[[Sequence[ChatMessage]], str],
-        completion_to_prompt: Callable[[str], str],
-        default_headers: Dict[str, str],
-    ):
-        super().__init__(
-            api_key=default_headers.get("Authorization"),
-            model=model,
-            endpoint=api_base.removesuffix(
-                "/v1"
-            ),  # mistral expects the base url without the /v1
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
-        )
-
-    def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        all_kwargs = super()._get_all_kwargs(**kwargs)
-        # apparently, this key is no longer acceptable to the API that is implemented by the Nvidia NIMs for mistral.
-        all_kwargs.pop('random_seed', None)
-        return all_kwargs
-
-    @property
-    def metadata(self) -> LLMMetadata:
-        ## todo: pull this info from somewhere
-        return LLMMetadata(
-            context_window=32000,  ## this is the minimum mistral context window from utils.py
-            num_output=self.max_tokens or -1,
-            is_chat_model=False,
-            is_function_calling_model=True,
-            model_name=self.model,
-        )

llm-service/app/services/caii/caii.py

Lines changed: 10 additions & 23 deletions
@@ -45,9 +45,10 @@
 from llama_index.core.base.llms.types import ChatMessage
 from llama_index.core.llms import LLM
 from llama_index.core.postprocessor.types import BaseNodePostprocessor
+from llama_index.llms.nvidia import NVIDIA

 from .CaiiEmbeddingModel import CaiiEmbeddingModel
-from .CaiiModel import CaiiModel, CaiiModelMistral, DeepseekModel
+from .CaiiModel import DeepseekModel
 from .caii_reranking import CaiiRerankingModel
 from .types import Endpoint, ListEndpointEntry, ModelResponse
 from .utils import build_auth_headers, get_caii_access_token
@@ -103,43 +104,29 @@ def get_llm(
 ) -> LLM:
     endpoint = describe_endpoint(endpoint_name=endpoint_name)
     api_base = endpoint.url.removesuffix("/chat/completions")
-    headers = build_auth_headers()

     model = endpoint.model_name
+    # todo: test if the NVIDIA impl works with deepseek, too
     if "deepseek" in endpoint_name.lower():
         return DeepseekModel(
             model=model,
             context=128000,
             messages_to_prompt=messages_to_prompt,
             completion_to_prompt=completion_to_prompt,
             api_base=api_base,
-            default_headers=headers,
-        )
-
-    if "mistral" in endpoint_name.lower():
-        return CaiiModelMistral(
-            model=model,
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
-            api_base=api_base,
-            context=128000,
-            default_headers=headers,
-        )
-
-    else:
-        return CaiiModel(
-            model=model,
-            context=128000,
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
-            api_base=api_base,
-            default_headers=headers,
+            default_headers=(build_auth_headers()),
         )
+    return NVIDIA(
+        api_key=get_caii_access_token(),
+        base_url=api_base,
+        model=model
+    )


 def get_embedding_model(model_name: str) -> BaseEmbedding:
     endpoint_name = model_name
     endpoint = describe_endpoint(endpoint_name=endpoint_name)
+    # todo: figure out if the Nvidia library can be made to work for embeddings as well.
     return CaiiEmbeddingModel(endpoint=endpoint)


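The effect of the new code path is roughly the following sketch; the endpoint URL, token, and model name below are placeholders, since in caii.py they come from describe_endpoint() and get_caii_access_token():

from llama_index.llms.nvidia import NVIDIA

# Placeholder values for illustration only; the real service derives these
# from the CAII endpoint description and the CAII access token.
api_base = "https://caii.example.cloudera.site/namespaces/serving-default/endpoints/my-llm/v1"
token = "fake-caii-access-token"
model = "mistral-7b-instruct"  # whatever model name the endpoint reports

llm = NVIDIA(
    api_key=token,      # the CAII access token is passed as the API key
    base_url=api_base,  # point the client at the CAII endpoint instead of NVIDIA's hosted API
    model=model,
)

print(llm.complete("Say hello in one short sentence.").text)

Since CAII endpoints are served by NVIDIA NIMs (as the removed comment in CaiiModel.py notes), the generic NVIDIA client can replace both the CaiiModel and CaiiModelMistral wrappers for everything except the deepseek path.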
llm-service/app/services/models/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -65,6 +65,15 @@
 from ..llama_utils import completion_to_prompt, messages_to_prompt
 from ..query.simple_reranker import SimpleReranker

+__all__ = [
+    'CAIIModelProvider',
+    'ModelType',
+    'Embedding',
+    'LLM',
+    'Reranking',
+    'ModelSource',
+    'BedrockModelProvider'
+]

 T = TypeVar("T", bound=BaseComponent)


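The explicit __all__ declares which re-exported names form the package's public API, so consumers (like the conftest.py change below) can import them from app.services.models rather than from private modules such as _bedrock. A hypothetical consumer, tying it back to the summarizer change above:

# Hypothetical consumer; it depends only on names re-exported above.
from app.services.models import CAIIModelProvider


def summarization_context_window() -> int | None:
    # Same check summary_indexer.py uses to decide whether to shrink the window.
    if CAIIModelProvider.is_enabled():
        return 3000
    return None  # other providers keep the library defaults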
llm-service/app/tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@
 from app.services.metadata_apis import data_sources_metadata_api
 from app.services import models
 from app.services.metadata_apis.data_sources_metadata_api import RagDataSource
-from app.services.models._bedrock import BedrockModelProvider
+from app.services.models import BedrockModelProvider


 @dataclass

llm-service/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,6 @@ dependencies = [
     "llama-index-embeddings-bedrock>=0.2.1",
     "llama-index-llms-bedrock>=0.1.13",
     "llama-index-llms-openai>=0.1.31",
-    "llama-index-llms-mistralai>=0.1.20",
     "llama-index-embeddings-openai>=0.1.11",
     "llama-index-vector-stores-qdrant>=0.2.17",
     "docx2txt>=0.8",
@@ -37,6 +36,7 @@ dependencies = [
     "mlflow>=2.20.1",
     "llama-index-llms-azure-openai>=0.3.0",
     "llama-index-embeddings-azure-openai>=0.3.0",
+    "llama-index-llms-nvidia>=0.3.2",
 ]
 requires-python = ">=3.10,<=3.12"
 readme = "README.md"

llm-service/uv.lock

Lines changed: 21 additions & 41 deletions
Some generated files are not rendered by default.
