Skip to content

Commit 5d0a355

Browse files
committed
Move aggregation logic when skip_tts is on to the assistant aggregator
1 parent 840b2d0 commit 5d0a355

File tree

3 files changed

+49
-38
lines changed

3 files changed

+49
-38
lines changed

src/pipecat/processors/aggregators/llm_response.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
2323
from pipecat.audio.vad.vad_analyzer import VADParams
2424
from pipecat.frames.frames import (
25-
AggregatedLLMTextFrame,
2625
BotStartedSpeakingFrame,
2726
BotStoppedSpeakingFrame,
2827
CancelFrame,

src/pipecat/processors/aggregators/llm_response_universal.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
2424
from pipecat.audio.vad.vad_analyzer import VADParams
2525
from pipecat.frames.frames import (
26+
AggregatedLLMTextFrame,
2627
BotStartedSpeakingFrame,
2728
BotStoppedSpeakingFrame,
2829
CancelFrame,
@@ -46,6 +47,7 @@
4647
LLMRunFrame,
4748
LLMSetToolChoiceFrame,
4849
LLMSetToolsFrame,
50+
LLMTextFrame,
4951
SpeechControlParamsFrame,
5052
StartFrame,
5153
TextFrame,
@@ -65,6 +67,7 @@
6567
LLMUserAggregatorParams,
6668
)
6769
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
70+
from pipecat.utils.string import match_endofsentence
6871
from pipecat.utils.time import time_now_iso8601
6972

7073

@@ -565,6 +568,9 @@ def __init__(
565568
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
566569
self._context_updated_tasks: Set[asyncio.Task] = set()
567570

571+
self._llm_aggregation: str = ""
572+
self._skip_tts: Optional[bool] = None
573+
568574
@property
569575
def has_function_calls_in_progress(self) -> bool:
570576
"""Check if there are any function calls currently in progress.
@@ -588,6 +594,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
588594
await self.push_frame(frame, direction)
589595
elif isinstance(frame, LLMFullResponseStartFrame):
590596
await self._handle_llm_start(frame)
597+
elif isinstance(frame, LLMTextFrame):
598+
await self._handle_llm_text(frame)
591599
elif isinstance(frame, LLMFullResponseEndFrame):
592600
await self._handle_llm_end(frame)
593601
elif isinstance(frame, TextFrame):
@@ -787,12 +795,50 @@ async def _handle_user_image_frame(self, frame: UserImageRawFrame):
787795
await self.push_aggregation()
788796
await self.push_context_frame(FrameDirection.UPSTREAM)
789797

790-
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
798+
async def _handle_llm_start(self, frame: LLMFullResponseStartFrame):
791799
self._started += 1
800+
if self._skip_tts is None:
801+
self._skip_tts = frame.skip_tts
802+
await self._maybe_push_llm_aggregation(frame)
803+
804+
async def _handle_llm_text(self, frame: LLMTextFrame):
805+
await self._handle_text(frame)
806+
if self._skip_tts or frame.skip_tts:
807+
self._llm_aggregation += frame.text
808+
await self._maybe_push_llm_aggregation(frame)
792809

793-
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
810+
async def _handle_llm_end(self, frame: LLMFullResponseEndFrame):
794811
self._started -= 1
795812
await self.push_aggregation()
813+
await self._maybe_push_llm_aggregation(frame)
814+
815+
async def _maybe_push_llm_aggregation(
816+
self, frame: LLMFullResponseStartFrame | LLMTextFrame | LLMFullResponseEndFrame
817+
):
818+
should_push = False
819+
if self._skip_tts and not frame.skip_tts:
820+
# if the skip_tts flag switches to false, push the current aggregation
821+
should_push = True
822+
self._skip_tts = frame.skip_tts
823+
if self._skip_tts:
824+
if self._skip_tts and isinstance(frame, LLMFullResponseEndFrame):
825+
# on end frame, always push the aggregation
826+
should_push = True
827+
elif len(self._llm_aggregation) > 0 and match_endofsentence(self._llm_aggregation):
828+
# push aggregation on end of sentence
829+
should_push = True
830+
831+
if not should_push:
832+
return
833+
834+
text = self._llm_aggregation.lstrip("\n")
835+
if not text.strip():
836+
# don't push empty text
837+
return
838+
839+
llm_frame = AggregatedLLMTextFrame(text=text, aggregated_by="sentence")
840+
await self.push_frame(llm_frame)
841+
self._llm_aggregation = ""
796842

797843
async def _handle_text(self, frame: TextFrame):
798844
if not self._started or not frame.append_to_context:

src/pipecat/processors/frameworks/rtvi.py

Lines changed: 1 addition & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -984,8 +984,6 @@ def __init__(
984984
self._last_user_audio_level = 0
985985
self._last_bot_audio_level = 0
986986

987-
self._skip_tts = None
988-
989987
if self._params.system_logs_enabled:
990988
self._system_logger_id = logger.add(self._logger_sink)
991989

@@ -1024,16 +1022,6 @@ async def send_rtvi_message(self, model: BaseModel, exclude_none: bool = True):
10241022
if self._rtvi:
10251023
await self._rtvi.push_transport_message(model, exclude_none)
10261024

1027-
async def send_aggregated_llm_text(self, text: str, aggregated_by: Optional[str] = None):
1028-
"""Send aggregated LLM text as a bot output message.
1029-
1030-
Args:
1031-
text: The aggregated text to send.
1032-
aggregated_by: The method of aggregation (e.g., "word", "sentence").
1033-
"""
1034-
if self._rtvi:
1035-
await self._rtvi.push_aggregated_llm_text(text, aggregated_by)
1036-
10371025
async def on_push_frame(self, data: FramePushed):
10381026
"""Process a frame being pushed through the pipeline.
10391027
@@ -1171,30 +1159,14 @@ async def _handle_llm_text_frame(self, frame: LLMTextFrame):
11711159
message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
11721160
await self.send_rtvi_message(message)
11731161

1174-
# initialize skip_tts on first LLMTextFrame
1175-
if self._skip_tts is None:
1176-
self._skip_tts = frame.skip_tts
1177-
1178-
orig_text = self._bot_transcription
1162+
# TODO: Remove all this logic when we fully deprecate bot-transcription messages.
11791163
self._bot_transcription += frame.text
11801164

11811165
if match_endofsentence(self._bot_transcription) and len(self._bot_transcription) > 0:
1182-
# TODO: Remove this message when we fully deprecate bot-transcription messages.
11831166
await self.send_rtvi_message(
11841167
RTVIBotTranscriptionMessage(data=RTVITextMessageData(text=self._bot_transcription))
11851168
)
1186-
if frame.skip_tts:
1187-
await self.send_aggregated_llm_text(
1188-
text=self._bot_transcription, aggregated_by="sentence"
1189-
)
11901169
self._bot_transcription = ""
1191-
elif not frame.skip_tts and self._skip_tts:
1192-
# We just switched from skipping TTS to not skipping TTS.
1193-
# Send any dangling transcription.
1194-
if len(orig_text) > 0:
1195-
await self.send_aggregated_llm_text(text=orig_text, aggregated_by="sentence")
1196-
self._bot_transcription = frame.text
1197-
self._skip_tts = frame.skip_tts
11981170

11991171
async def _handle_user_transcriptions(self, frame: Frame):
12001172
"""Handle user transcription frames."""
@@ -1424,12 +1396,6 @@ async def push_transport_message(self, model: BaseModel, exclude_none: bool = Tr
14241396
)
14251397
await self.push_frame(frame)
14261398

1427-
async def push_aggregated_llm_text(self, text: str, aggregated_by: Optional[str] = None):
1428-
"""Push an aggregated LLM text frame."""
1429-
frame = AggregatedLLMTextFrame(text=text, aggregated_by=aggregated_by)
1430-
frame.skip_tts = True
1431-
await self.push_frame(frame)
1432-
14331399
async def handle_message(self, message: RTVIMessage):
14341400
"""Handle an incoming RTVI message.
14351401

0 commit comments

Comments
 (0)