Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
room_io,
text_transforms,
)
from .voice.agent_session import PreemptiveGenerationOptions
from .voice.amd import (
AMD,
AMDCategory,
Expand Down Expand Up @@ -231,6 +232,7 @@ def __getattr__(name: str) -> typing.Any:
"TurnHandlingOptions",
"EndpointingOptions",
"InterruptionOptions",
"PreemptiveGenerationOptions",
]

# Cleanup docs of unexported modules
Expand Down
17 changes: 16 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._speech_tasks: list[asyncio.Task[Any]] = []

self._preemptive_generation: _PreemptiveGeneration | None = None
self._preemptive_generation_count: int = 0
self._authorization_allowed = asyncio.Event()
self._authorization_allowed.set()

Expand Down Expand Up @@ -1769,8 +1770,9 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No
)

def on_preemptive_generation(self, info: _PreemptiveGenerationInfo) -> None:
preemptive_opts = self._session.options.preemptive_generation
if (
not self._session.options.preemptive_generation
not preemptive_opts
or self._scheduling_paused
or self._new_turns_blocked
or (self._current_speech is not None and not self._current_speech.interrupted)
Expand All @@ -1780,6 +1782,17 @@ def on_preemptive_generation(self, info: _PreemptiveGenerationInfo) -> None:

self._cancel_preemptive_generation()

if (
info.started_speaking_at is not None
and time.time() - info.started_speaking_at > preemptive_opts["max_speech_duration"]
):
return

if self._preemptive_generation_count >= preemptive_opts["max_retries"]:
return

self._preemptive_generation_count += 1
Comment on lines 1783 to +1794
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Existing preemptive generation is cancelled before max_retries check, discarding valid work

In on_preemptive_generation, _cancel_preemptive_generation() is called unconditionally on line 1783 before the max_retries check on line 1791. When _preemptive_generation_count >= max_retries, the method returns early without starting a new generation — but the previous (most recent) preemptive generation has already been cancelled and set to None. This means the last successful preemptive generation is destroyed without replacement. Later, in _user_turn_completed_task at line 1995, self._preemptive_generation is None, so the preemptive result can never be used and a fresh (non-preemptive) LLM call is always made instead. This defeats the purpose of the max_retries limit, which should keep the last generation alive when retries are exhausted.

The fix is to move _cancel_preemptive_generation() after the early-return checks (or at least after the max_retries check), so the existing generation is only cancelled when it will actually be replaced by a new one.

Suggested change
self._cancel_preemptive_generation()
if (
info.started_speaking_at is not None
and time.time() - info.started_speaking_at > preemptive_opts["max_speech_duration"]
):
return
if self._preemptive_generation_count >= preemptive_opts["max_retries"]:
return
self._preemptive_generation_count += 1
if (
info.started_speaking_at is not None
and time.time() - info.started_speaking_at > preemptive_opts["max_speech_duration"]
):
self._cancel_preemptive_generation()
return
if self._preemptive_generation_count >= preemptive_opts["max_retries"]:
return
self._cancel_preemptive_generation()
self._preemptive_generation_count += 1
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is expected: on_preemptive_generation is called whenever the user transcript changes, so the previous preemptive generation is already invalid and we should cancel it as soon as possible.


user_message = llm.ChatMessage(
role="user",
content=[info.new_transcript],
Expand Down Expand Up @@ -1861,6 +1874,8 @@ async def _user_turn_completed_task(
# is detected. So the previous execution should complete quickly.
await old_task

self._preemptive_generation_count = 0

# When the audio recognition detects the end of a user turn:
# - check if realtime model server-side turn detection is enabled
# - check if there is no current generation happening
Expand Down
38 changes: 34 additions & 4 deletions livekit-agents/livekit/agents/voice/agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,34 @@ def _resolve_recording_options(record: bool | RecordingOptions) -> RecordingOpti
return RecordingOptions(**{**_RECORDING_ALL_ON, **record})


class PreemptiveGenerationOptions(TypedDict, total=False):
"""Configuration for preemptive generation."""

max_speech_duration: float
"""Maximum user speech duration (s) for which preemptive generation
is attempted. Beyond this threshold, preemptive generation is skipped
since long utterances are more likely to change and users may expect
slower responses. Defaults to ``10.0``."""

max_retries: int
"""Maximum number of preemptive generation attempts per user turn.
The counter resets when the turn completes. Defaults to ``3``."""


_PREEMPTIVE_GENERATION_DEFAULTS: PreemptiveGenerationOptions = {
"max_speech_duration": 10.0,
"max_retries": 3,
}


def _resolve_preemptive_generation(
config: bool | PreemptiveGenerationOptions,
) -> PreemptiveGenerationOptions | None:
if isinstance(config, bool):
return PreemptiveGenerationOptions(**_PREEMPTIVE_GENERATION_DEFAULTS) if config else None
return PreemptiveGenerationOptions(**{**_PREEMPTIVE_GENERATION_DEFAULTS, **config})


@dataclass
class SessionConnectOptions:
stt_conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
Expand All @@ -135,7 +163,7 @@ class AgentSessionOptions:
turn_handling: TurnHandlingOptions
max_tool_steps: int
user_away_timeout: float | None
preemptive_generation: bool
preemptive_generation: PreemptiveGenerationOptions | None
min_consecutive_speech_delay: float
use_tts_aligned_transcript: bool | None
tts_text_transforms: Sequence[TextTransforms] | None
Expand Down Expand Up @@ -226,7 +254,7 @@ def __init__(
# Misc settings
userdata: NotGivenOr[Userdata_T] = NOT_GIVEN,
video_sampler: NotGivenOr[_VideoSampler | None] = NOT_GIVEN,
preemptive_generation: bool = True,
preemptive_generation: bool | PreemptiveGenerationOptions = True,
aec_warmup_duration: float | None = 3.0,
ivr_detection: bool = False,
user_away_timeout: float | None = 15.0,
Expand Down Expand Up @@ -292,12 +320,14 @@ def __init__(
user_away_timeout (float, optional): If set, set the user state as
"away" after this amount of time after user and agent are silent.
Defaults to ``15.0`` s, set to ``None`` to disable.
preemptive_generation (bool):
preemptive_generation (bool | PreemptiveGenerationOptions):
Whether to speculatively begin LLM and TTS requests before an end-of-turn is
detected. When True, the agent sends inference calls as soon as a user
transcript is received rather than waiting for a definitive turn boundary. This
can reduce response latency by overlapping model inference with user audio,
but may incur extra compute if the user interrupts or revises mid-utterance.
Pass a ``PreemptiveGenerationOptions`` dict for fine-grained control
(e.g. ``{"max_speech_duration": 5.0}``).
Defaults to ``True``.
aec_warmup_duration (float, optional): The duration in seconds that the agent
will ignore user's audio interruptions after the agent starts speaking.
Expand Down Expand Up @@ -358,7 +388,7 @@ def __init__(
),
max_tool_steps=max_tool_steps,
user_away_timeout=user_away_timeout,
preemptive_generation=preemptive_generation,
preemptive_generation=_resolve_preemptive_generation(preemptive_generation),
min_consecutive_speech_delay=min_consecutive_speech_delay,
tts_text_transforms=(
tts_text_transforms
Expand Down
Loading