Skip to content

Commit e91453b

Browse files
authored
Fix ollama arguments (#395)
* Fix ollama arguments
* Fix CI + update doc and examples
* CHANGELOG
1 parent e4a1f5c commit e91453b

File tree

5 files changed

+99
-21
lines changed

5 files changed

+99
-21
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
### Fixed
66

77
- Fixed documentation for PdfLoader
8+
- Fixed a bug where the `format` argument for `OllamaLLM` was not propagated to the client.
9+
810

911
## 1.9.0
1012

docs/source/user_guide_rag.rst

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ it can be queried using the following:
225225
from neo4j_graphrag.llm import OllamaLLM
226226
llm = OllamaLLM(
227227
model_name="orca-mini",
228+
# model_params={"options": {"temperature": 0}, "format": "json"},
228229
# host="...", # when using a remote server
229230
)
230231
llm.invoke("say something")
@@ -305,17 +306,17 @@ Default Rate Limit Handler
305306
Rate limiting is enabled by default for all LLM instances with the following configuration:
306307

307308
- **Max attempts**: 3
308-
- **Min wait**: 1.0 seconds
309+
- **Min wait**: 1.0 seconds
309310
- **Max wait**: 60.0 seconds
310311
- **Multiplier**: 2.0 (exponential backoff)
311312

312313
.. code:: python
313314
314315
from neo4j_graphrag.llm import OpenAILLM
315-
316+
316317
# Rate limiting is automatically enabled
317318
llm = OpenAILLM(model_name="gpt-4o")
318-
319+
319320
# The LLM will automatically retry on rate limit errors
320321
response = llm.invoke("Hello, world!")
321322
@@ -327,7 +328,7 @@ Rate limiting is enabled by default for all LLM instances with the following con
327328
328329
from neo4j_graphrag.llm import OpenAILLM
329330
from neo4j_graphrag.llm.rate_limit import RetryRateLimitHandler
330-
331+
331332
# Customize rate limiting parameters
332333
llm = OpenAILLM(
333334
model_name="gpt-4o",
@@ -348,15 +349,15 @@ You can customize the rate limiting behavior by creating your own rate limit han
348349
349350
from neo4j_graphrag.llm import AnthropicLLM
350351
from neo4j_graphrag.llm.rate_limit import RateLimitHandler
351-
352+
352353
class CustomRateLimitHandler(RateLimitHandler):
353354
"""Implement your custom rate limiting strategy."""
354355
# Implement required methods: handle_sync, handle_async
355356
pass
356-
357+
357358
# Create custom rate limit handler and pass it to the LLM interface
358359
custom_handler = CustomRateLimitHandler()
359-
360+
360361
llm = AnthropicLLM(
361362
model_name="claude-3-sonnet-20240229",
362363
rate_limit_handler=custom_handler,
@@ -370,7 +371,7 @@ For high-throughput applications or when you handle rate limiting externally, yo
370371
.. code:: python
371372
372373
from neo4j_graphrag.llm import CohereLLM, NoOpRateLimitHandler
373-
374+
374375
# Disable rate limiting completely
375376
llm = CohereLLM(
376377
model_name="command-r-plus",

examples/customize/llms/ollama_llm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
llm = OllamaLLM(
88
model_name="<model_name>",
9+
# model_params={"options": {"temperature": 0}, "format": "json"},
910
# host="...", # if using a remote server
1011
)
1112
res: LLMResponse = llm.invoke("What is the additive color model?")

src/neo4j_graphrag/llm/ollama_llm.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
from __future__ import annotations
1616

17+
import warnings
1718
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, Union, cast
1819

1920
from pydantic import ValidationError
@@ -59,6 +60,19 @@ def __init__(
5960
self.async_client = ollama.AsyncClient(
6061
**kwargs,
6162
)
63+
if "stream" in self.model_params:
64+
raise ValueError("Streaming is not supported by the OllamaLLM wrapper")
65+
# bug-fix with backward compatibility:
66+
# we mistakenly passed all "model_params" under the options argument
67+
# next two lines to be removed in 2.0
68+
if not any(
69+
key in self.model_params for key in ("options", "format", "keep_alive")
70+
):
71+
warnings.warn(
72+
"""Passing options directly without including them in an 'options' key is deprecated. Ie you must use model_params={"options": {"temperature": 0}}""",
73+
DeprecationWarning,
74+
)
75+
self.model_params = {"options": self.model_params}
6276

6377
def get_messages(
6478
self,
@@ -104,7 +118,7 @@ def invoke(
104118
response = self.client.chat(
105119
model=self.model_name,
106120
messages=self.get_messages(input, message_history, system_instruction),
107-
options=self.model_params,
121+
**self.model_params,
108122
)
109123
content = response.message.content or ""
110124
return LLMResponse(content=content)

tests/unit/llm/test_ollama_llm.py

Lines changed: 72 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,80 @@ def test_ollama_llm_missing_dependency(mock_import: Mock) -> None:
3535

3636

3737
@patch("builtins.__import__")
38-
def test_ollama_llm_happy_path(mock_import: Mock) -> None:
38+
def test_ollama_llm_happy_path_deprecated_options(mock_import: Mock) -> None:
3939
mock_ollama = get_mock_ollama()
4040
mock_import.return_value = mock_ollama
4141
mock_ollama.Client.return_value.chat.return_value = MagicMock(
4242
message=MagicMock(content="ollama chat response"),
4343
)
4444
model = "gpt"
4545
model_params = {"temperature": 0.3}
46+
with pytest.warns(DeprecationWarning) as record:
47+
llm = OllamaLLM(
48+
model,
49+
model_params=model_params,
50+
)
51+
assert len(record) == 1
52+
assert isinstance(record[0].message, Warning)
53+
assert (
54+
'you must use model_params={"options": {"temperature": 0}}'
55+
in record[0].message.args[0]
56+
)
57+
58+
question = "What is graph RAG?"
59+
res = llm.invoke(question)
60+
assert isinstance(res, LLMResponse)
61+
assert res.content == "ollama chat response"
62+
messages = [
63+
{"role": "user", "content": question},
64+
]
65+
llm.client.chat.assert_called_once_with( # type: ignore[attr-defined]
66+
model=model, messages=messages, options={"temperature": 0.3}
67+
)
68+
69+
70+
@patch("builtins.__import__")
71+
def test_ollama_llm_unsupported_streaming(mock_import: Mock) -> None:
72+
mock_ollama = get_mock_ollama()
73+
mock_import.return_value = mock_ollama
74+
mock_ollama.Client.return_value.chat.return_value = MagicMock(
75+
message=MagicMock(content="ollama chat response"),
76+
)
77+
model = "gpt"
78+
model_params = {"stream": True}
79+
with pytest.raises(ValueError):
80+
OllamaLLM(
81+
model,
82+
model_params=model_params,
83+
)
84+
85+
86+
@patch("builtins.__import__")
87+
def test_ollama_llm_happy_path(mock_import: Mock) -> None:
88+
mock_ollama = get_mock_ollama()
89+
mock_import.return_value = mock_ollama
90+
mock_ollama.Client.return_value.chat.return_value = MagicMock(
91+
message=MagicMock(content="ollama chat response"),
92+
)
93+
model = "gpt"
94+
options = {"temperature": 0.3}
95+
model_params = {"options": options, "format": "json"}
4696
question = "What is graph RAG?"
4797
llm = OllamaLLM(
48-
model,
98+
model_name=model,
4999
model_params=model_params,
50100
)
51-
52101
res = llm.invoke(question)
53102
assert isinstance(res, LLMResponse)
54103
assert res.content == "ollama chat response"
55104
messages = [
56105
{"role": "user", "content": question},
57106
]
58107
llm.client.chat.assert_called_once_with( # type: ignore[attr-defined]
59-
model=model, messages=messages, options=model_params
108+
model=model,
109+
messages=messages,
110+
options=options,
111+
format="json",
60112
)
61113

62114

@@ -68,7 +120,8 @@ def test_ollama_invoke_with_system_instruction_happy_path(mock_import: Mock) ->
68120
message=MagicMock(content="ollama chat response"),
69121
)
70122
model = "gpt"
71-
model_params = {"temperature": 0.3}
123+
options = {"temperature": 0.3}
124+
model_params = {"options": options, "format": "json"}
72125
llm = OllamaLLM(
73126
model,
74127
model_params=model_params,
@@ -81,7 +134,10 @@ def test_ollama_invoke_with_system_instruction_happy_path(mock_import: Mock) ->
81134
messages = [{"role": "system", "content": system_instruction}]
82135
messages.append({"role": "user", "content": question})
83136
llm.client.chat.assert_called_once_with( # type: ignore[attr-defined]
84-
model=model, messages=messages, options=model_params
137+
model=model,
138+
messages=messages,
139+
options=options,
140+
format="json",
85141
)
86142

87143

@@ -93,7 +149,8 @@ def test_ollama_invoke_with_message_history_happy_path(mock_import: Mock) -> Non
93149
message=MagicMock(content="ollama chat response"),
94150
)
95151
model = "gpt"
96-
model_params = {"temperature": 0.3}
152+
options = {"temperature": 0.3}
153+
model_params = {"options": options}
97154
llm = OllamaLLM(
98155
model,
99156
model_params=model_params,
@@ -109,7 +166,7 @@ def test_ollama_invoke_with_message_history_happy_path(mock_import: Mock) -> Non
109166
messages = [m for m in message_history]
110167
messages.append({"role": "user", "content": question})
111168
llm.client.chat.assert_called_once_with( # type: ignore[attr-defined]
112-
model=model, messages=messages, options=model_params
169+
model=model, messages=messages, options=options
113170
)
114171

115172

@@ -123,7 +180,8 @@ def test_ollama_invoke_with_message_history_and_system_instruction(
123180
message=MagicMock(content="ollama chat response"),
124181
)
125182
model = "gpt"
126-
model_params = {"temperature": 0.3}
183+
options = {"temperature": 0.3}
184+
model_params = {"options": options}
127185
system_instruction = "You are a helpful assistant."
128186
llm = OllamaLLM(
129187
model,
@@ -145,7 +203,7 @@ def test_ollama_invoke_with_message_history_and_system_instruction(
145203
messages.extend(message_history)
146204
messages.append({"role": "user", "content": question})
147205
llm.client.chat.assert_called_once_with( # type: ignore[attr-defined]
148-
model=model, messages=messages, options=model_params
206+
model=model, messages=messages, options=options
149207
)
150208
assert llm.client.chat.call_count == 1 # type: ignore
151209

@@ -156,7 +214,8 @@ def test_ollama_invoke_with_message_history_validation_error(mock_import: Mock)
156214
mock_import.return_value = mock_ollama
157215
mock_ollama.ResponseError = ollama.ResponseError
158216
model = "gpt"
159-
model_params = {"temperature": 0.3}
217+
options = {"temperature": 0.3}
218+
model_params = {"options": options}
160219
system_instruction = "You are a helpful assistant."
161220
llm = OllamaLLM(
162221
model,
@@ -187,7 +246,8 @@ async def mock_chat_async(*args: Any, **kwargs: Any) -> MagicMock:
187246

188247
mock_ollama.AsyncClient.return_value.chat = mock_chat_async
189248
model = "gpt"
190-
model_params = {"temperature": 0.3}
249+
options = {"temperature": 0.3}
250+
model_params = {"options": options}
191251
question = "What is graph RAG?"
192252
llm = OllamaLLM(
193253
model,

0 commit comments

Comments (0)