CHANGELOG.md: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
- Support for Python 3.13
- Added support for automatic schema extraction from text using LLMs. In the `SimpleKGPipeline`, when the user provides no schema, the automatic schema extraction is enabled by default.
- Added ability to return a user-defined message if context is empty in GraphRAG (which skips the LLM call).
+- Added automatic rate limiting with retry logic and exponential backoff for all LLM providers using tenacity. The `RateLimitHandler` interface allows for custom rate limiting strategies, including the ability to disable rate limiting entirely.
In order to run this code, the `google-cloud-aiplatform` Python package needs to be installed:
-`pip install "neo4j_grpahrag[vertexai]"`
+`pip install "neo4j_graphrag[google]"`
See :ref:`vertexaillm`.
@@ -294,6 +294,91 @@ Here's an example using the Python Ollama client:
See :ref:`llminterface`.
Rate Limit Handling
===================

All LLM implementations include automatic rate limiting that uses retry logic with exponential backoff by default. This feature helps handle API rate limits from LLM providers gracefully by automatically retrying failed requests with increasing wait times between attempts.

Default Rate Limit Handler
--------------------------

Rate limiting is enabled by default for all LLM instances with the following configuration:

- **Max attempts**: 3
- **Min wait**: 1.0 seconds
- **Max wait**: 60.0 seconds
- **Multiplier**: 2.0 (exponential backoff)

.. code:: python

    from neo4j_graphrag.llm import OpenAILLM

    # Rate limiting is automatically enabled
    llm = OpenAILLM(model_name="gpt-4o")

    # The LLM will automatically retry on rate limit errors
    response = llm.invoke("Hello, world!")
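
If every retry attempt is exhausted, the error is propagated to the caller. A minimal sketch of handling that case, assuming the library surfaces a dedicated `RateLimitError` (verify the exact exception name in `neo4j_graphrag.exceptions` for your installed version):

.. code:: python

    from neo4j_graphrag.exceptions import RateLimitError  # assumed exception name
    from neo4j_graphrag.llm import OpenAILLM

    llm = OpenAILLM(model_name="gpt-4o")

    try:
        response = llm.invoke("Hello, world!")
    except RateLimitError:
        # All retry attempts failed; fall back or surface the error to the user
        print("The LLM provider is still rate limiting requests, try again later.")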

.. note::

    To change the default configuration of `RetryRateLimitHandler`:

    .. code:: python

        from neo4j_graphrag.llm import OpenAILLM
        from neo4j_graphrag.llm.rate_limit import RetryRateLimitHandler

        # Customize rate limiting parameters
        llm = OpenAILLM(
            model_name="gpt-4o",
            rate_limit_handler=RetryRateLimitHandler(
                max_attempts=10,  # Increase max retry attempts
                min_wait=2.0,     # Increase minimum wait time
                max_wait=120.0,   # Increase maximum wait time
                multiplier=3.0,   # More aggressive backoff
            )
        )
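
Rate limiting can also be disabled entirely, as noted in the changelog entry above. A minimal sketch of what that could look like, assuming the `rate_limit` module ships a no-op handler (the name `NoOpRateLimitHandler` used here is an assumption to check against the installed version):

.. code:: python

    from neo4j_graphrag.llm import OpenAILLM
    from neo4j_graphrag.llm.rate_limit import NoOpRateLimitHandler  # assumed name

    # No retries: rate limit errors from the provider surface immediately
    llm = OpenAILLM(
        model_name="gpt-4o",
        rate_limit_handler=NoOpRateLimitHandler(),
    )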

Custom Rate Limiting
--------------------

You can customize the rate limiting behavior by creating your own rate limit handler:

.. code:: python

    from neo4j_graphrag.llm import AnthropicLLM
    from neo4j_graphrag.llm.rate_limit import RateLimitHandler

    class CustomRateLimitHandler(RateLimitHandler):
        """Implement your custom rate limiting strategy."""

    # Optional: Apply rate limit handling to synchronous invoke method
    # @rate_limit_handler
    def invoke(
        self,
        input: str,
@@ -24,6 +31,8 @@ def invoke(
        )
        return LLMResponse(content=content)

    # Optional: Apply rate limit handling to asynchronous ainvoke method
    # @async_rate_limit_handler
    async def ainvoke(
        self,
        input: str,
@@ -33,6 +42,33 @@ async def ainvoke(
        raise NotImplementedError()

-llm = CustomLLM("")
+llm = CustomLLM(
+    ""
+)  # if rate_limit_handler and async_rate_limit_handler decorators are used, the default rate limit handler will be applied automatically (retry with exponential backoff)
res: LLMResponse = llm.invoke("text")
print(res.content)

# If rate_limit_handler and async_rate_limit_handler decorators are used and you want to use a custom rate limit handler
# Type variables for function signatures used in rate limit handlers
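
# --- Sketch, not part of the original diff, completing the two comments above ---
# A type variable for the callables wrapped by a rate limit handler would look
# something like this (the exact definition in the truncated example is assumed):
from typing import Any, Callable, TypeVar

F = TypeVar("F", bound=Callable[..., Any])

# Assuming CustomLLM forwards extra keyword arguments (including
# rate_limit_handler) to LLMInterface, a non-default handler can be supplied,
# for example the RetryRateLimitHandler shown earlier with custom settings:
from neo4j_graphrag.llm.rate_limit import RetryRateLimitHandler

llm_custom = CustomLLM("", rate_limit_handler=RetryRateLimitHandler(max_attempts=5))
res_custom: LLMResponse = llm_custom.invoke("text")
print(res_custom.content)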
0 commit comments