Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions core/llm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,19 @@ async def __call__(
request_log.status = LLMRequestStatus.ERROR
wait_time = self.rate_limit_sleep(err)
if wait_time:
message = f"We've hit {self.config.provider.value} rate limit. Sleeping for {wait_time.seconds} seconds..."
# Use total_seconds() instead of .seconds to correctly
# handle timedeltas that span days (e.g. daily rate limits).
# The .seconds attribute only returns the seconds component
# within the current day, which can be 0 for timedelta(days=1)
# or ~86400 for small negative timedeltas.
sleep_seconds = int(wait_time.total_seconds())
# Clamp to a reasonable range: at least 1 second, at most 1 hour
sleep_seconds = max(1, min(sleep_seconds, 3600))
message = f"We've hit {self.config.provider.value} rate limit. Sleeping for {sleep_seconds} seconds..."
log.info(message)
if self.error_handler:
await self.error_handler(LLMError.RATE_LIMITED, message)
await asyncio.sleep(wait_time.seconds)
await asyncio.sleep(sleep_seconds)
continue
else:
# RateLimitError that shouldn't be retried, eg. insufficient funds
Expand Down
141 changes: 141 additions & 0 deletions tests/llm/test_rate_limit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""
Tests for rate limit sleep duration handling in BaseLLMClient.

Verifies that the sleep duration is correctly computed using
total_seconds() and clamped to a reasonable range (1s to 3600s).
"""

import datetime
from unittest.mock import AsyncMock, MagicMock, patch

import openai
import pytest

from core.config import LLMConfig
from core.llm.convo import Convo
from core.llm.openai_client import OpenAIClient
from core.state.state_manager import StateManager


async def mock_response_generator(*content):
    """Async generator yielding one mock streaming chunk per content item.

    Each yielded chunk mimics the OpenAI streaming response shape:
    ``chunk.choices[0].delta.content`` holds the text piece.
    """
    for piece in content:
        delta = MagicMock(content=piece)
        mock_chunk = MagicMock()
        mock_chunk.choices = [MagicMock(delta=delta)]
        yield mock_chunk


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("sleep_timedelta", "expected_sleep"),
    [
        # Normal case: 11 seconds from rate limit header
        (datetime.timedelta(seconds=11), 11),
        # Large value from OpenAI: 2 hours should be capped at 3600
        (datetime.timedelta(seconds=7200), 3600),
        # Anthropic daily rate limit: 24h becomes timedelta(days=1),
        # .seconds would be 0 but total_seconds() is 86400, capped to 3600
        (datetime.timedelta(days=1), 3600),
        # Negative timedelta (clock drift): .seconds would be 86395
        # but total_seconds() is -5, clamped to minimum of 1
        (datetime.timedelta(seconds=-5), 1),
        # Small positive timedelta
        (datetime.timedelta(seconds=1), 1),
        # Edge case: exactly 1 hour
        (datetime.timedelta(hours=1), 3600),
        # Fractional seconds (e.g. 11.276s from OpenAI message)
        (datetime.timedelta(seconds=11.276), 11),
    ],
)
@patch("core.cli.helpers.StateManager")
@patch("core.llm.openai_client.AsyncOpenAI")
async def test_rate_limit_sleep_duration_is_clamped(
    mock_AsyncOpenAI, mock_state_manager, sleep_timedelta, expected_sleep
):
    """
    Test that the rate limit sleep duration correctly uses total_seconds()
    and is clamped between 1 and 3600 seconds.
    """
    config = LLMConfig(model="gpt-4-turbo")
    conversation = Convo("system hello").user("user hello")
    state_manager = StateManager(mock_state_manager)

    # Wire up the mocked OpenAI SDK client returned by AsyncOpenAI():
    # llm.client.chat.completions must resolve to an AsyncMock.
    chat_mock = AsyncMock()
    chat_mock.completions = AsyncMock()
    client_mock = AsyncMock()
    client_mock.chat = chat_mock
    mock_AsyncOpenAI.return_value = client_mock

    llm = OpenAIClient(config, state_manager=state_manager)

    # Force rate_limit_sleep to report the parametrized wait time.
    llm.rate_limit_sleep = MagicMock(return_value=sleep_timedelta)

    # Build a RateLimitError whose mocked HTTP response carries no headers.
    err_response = MagicMock()
    err_response.headers = {}
    err_response.json.return_value = {"error": {"message": "rate limited"}}
    rate_limit_err = openai.RateLimitError(
        message="Rate limit exceeded",
        response=err_response,
        body={"error": {"message": "rate limited"}},
    )

    # First attempt raises the rate limit error; the retry succeeds.
    llm._make_request = AsyncMock(side_effect=[rate_limit_err, ("success", 10, 5)])

    with patch("core.llm.base.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
        response, _request_log = await llm(conversation)

    # The retry loop must sleep exactly once, with the clamped duration.
    mock_sleep.assert_awaited_once_with(expected_sleep)
    assert response == "success"


@pytest.mark.asyncio
@patch("core.cli.helpers.StateManager")
@patch("core.llm.openai_client.AsyncOpenAI")
async def test_rate_limit_none_raises_api_error(mock_AsyncOpenAI, mock_state_manager):
    """
    Test that when rate_limit_sleep returns None (e.g. insufficient funds),
    the error is raised instead of retrying.
    """
    from core.llm.base import APIError

    config = LLMConfig(model="gpt-4-turbo")
    conversation = Convo("system hello").user("user hello")
    state_manager = StateManager(mock_state_manager)

    # Wire up the mocked OpenAI SDK client returned by AsyncOpenAI().
    chat_mock = AsyncMock()
    chat_mock.completions = AsyncMock()
    client_mock = AsyncMock()
    client_mock.chat = chat_mock
    mock_AsyncOpenAI.return_value = client_mock

    llm = OpenAIClient(config, state_manager=state_manager)

    # rate_limit_sleep returning None means "do not retry".
    llm.rate_limit_sleep = MagicMock(return_value=None)

    err_response = MagicMock()
    err_response.headers = {}
    err_response.json.return_value = {"error": {"message": "Insufficient funds"}}
    rate_limit_err = openai.RateLimitError(
        message="Rate limit exceeded",
        response=err_response,
        body={"error": {"message": "Insufficient funds"}},
    )

    # Every attempt fails; with no sleep time the client must give up.
    llm._make_request = AsyncMock(side_effect=rate_limit_err)

    with pytest.raises(APIError, match="Insufficient funds"):
        await llm(conversation)