
Commit eadb06b

Adding support for new bot-output RTVI Message:

1. TTSTextFrames now include metadata about whether the text was spoken or not, along with a type string describing what the text represents, e.g. "sentence", "word", "custom aggregation".
2. Expanded how aggregators work so that the aggregate method returns the aggregated text along with the type of aggregation used to create it.
3. Deprecated the RTVI bot-transcription event in favor of the new bot-output event described in item 4.
4. Introduced support for a new bot-output event. This event is meant to be the one-stop shop for communicating what the bot actually "says". It is based on TTSTextFrames, communicating both sentence by sentence (or whatever aggregation is used) and word by word. In addition, it includes LLMTextFrames, aggregated by sentence, when TTS is turned off (i.e. skip_tts is true).

Resolves pipecat-ai/pipecat-client-web#158
1 parent 84ed246 commit eadb06b

12 files changed: +268, -108 lines

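For reference, a minimal sketch of what the new event looks like on the wire, built with the message models this commit adds in rtvi.py below (the "rtvi-ai" label value is assumed from the existing RTVI message conventions):

from pipecat.processors.frameworks.rtvi import (
    RTVIBotOutputMessage,
    RTVIBotOutputMessageData,
)

# A sentence the bot actually spoke via TTS.
message = RTVIBotOutputMessage(
    data=RTVIBotOutputMessageData(text="Hi there.", spoken=True, aggregated_by="sentence")
)
print(message.model_dump_json(exclude_none=True))
# -> {"label": "rtvi-ai", "type": "bot-output",
#     "data": {"text": "Hi there.", "spoken": true, "aggregated_by": "sentence"}}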

src/pipecat/frames/frames.py

Lines changed: 3 additions & 0 deletions
@@ -351,6 +351,9 @@ class LLMTextFrame(TextFrame):
 class TTSTextFrame(TextFrame):
     """Text frame generated by Text-to-Speech services."""

+    aggregated_by: Literal["sentence", "word"] | str
+    spoken: Optional[bool] = True  # Whether this text has been spoken by TTS
+
     pass

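For illustration, a minimal sketch of constructing the extended frame, mirroring how the services below pass the new fields (the positional text argument is assumed from the existing TextFrame dataclass):

from pipecat.frames.frames import TTSTextFrame

# Word-level text that was actually synthesized and spoken.
spoken_word = TTSTextFrame("hello", aggregated_by="word", spoken=True)

# Sentence-level text that was deliberately not spoken (e.g. when skip_tts is set).
silent_sentence = TTSTextFrame("Hi there.", aggregated_by="sentence", spoken=False)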

src/pipecat/processors/frameworks/rtvi.py

Lines changed: 77 additions & 4 deletions
@@ -704,6 +704,29 @@ class RTVITextMessageData(BaseModel):
     text: str


+class RTVIBotOutputMessageData(RTVITextMessageData):
+    """Data for bot output RTVI messages.
+
+    Extends RTVITextMessageData to include metadata about the output.
+    """
+
+    spoken: bool = True  # Indicates if the text has been spoken by TTS
+    aggregated_by: Optional[Literal["word", "sentence"] | str] = None
+    # Indicates what form the text is in (e.g., by word, sentence, etc.)
+
+
+class RTVIBotOutputMessage(BaseModel):
+    """Message containing bot output text.
+
+    An event meant to holistically represent what the bot is outputting,
+    along with metadata about the output and whether it has been spoken.
+    """
+
+    label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
+    type: Literal["bot-output"] = "bot-output"
+    data: RTVIBotOutputMessageData
+
+
 class RTVIBotTranscriptionMessage(BaseModel):
     """Message containing bot transcription text.
@@ -960,6 +983,8 @@ def __init__(
         self._last_user_audio_level = 0
         self._last_bot_audio_level = 0

+        self._skip_tts = None
+
         if self._params.system_logs_enabled:
             self._system_logger_id = logger.add(self._logger_sink)

@@ -1050,8 +1075,7 @@ async def on_push_frame(self, data: FramePushed):
             await self.send_rtvi_message(RTVIBotTTSStoppedMessage())
         elif isinstance(frame, TTSTextFrame) and self._params.bot_tts_enabled:
             if isinstance(src, BaseOutputTransport):
-                message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
-                await self.send_rtvi_message(message)
+                await self._handle_tts_text_frame(frame)
             else:
                 mark_as_seen = False
         elif isinstance(frame, MetricsFrame) and self._params.metrics_enabled:
@@ -1115,14 +1139,63 @@ async def _handle_bot_speaking(self, frame: Frame):
         if message:
             await self.send_rtvi_message(message)

+    async def _handle_tts_text_frame(self, frame: TTSTextFrame):
+        """Handle TTS text output frames."""
+        # send the tts-text message
+        message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
+        await self.send_rtvi_message(message)
+        # send the bot-output message
+        message = RTVIBotOutputMessage(
+            data=RTVIBotOutputMessageData(
+                text=frame.text, spoken=frame.spoken, aggregated_by=frame.aggregated_by
+            )
+        )
+        await self.send_rtvi_message(message)
+
     async def _handle_llm_text_frame(self, frame: LLMTextFrame):
         """Handle LLM text output frames."""
         message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
         await self.send_rtvi_message(message)

+        # initialize skip_tts on first LLMTextFrame
+        if self._skip_tts is None:
+            self._skip_tts = frame.skip_tts
+
+        messages = []
+        should_reset_transcription = False
         self._bot_transcription += frame.text
-        if match_endofsentence(self._bot_transcription):
-            await self._push_bot_transcription()
+
+        if not frame.skip_tts and self._skip_tts:
+            # We just switched from skipping TTS to not skipping TTS.
+            # Send and reset any existing transcription.
+            if len(self._bot_transcription) > 0:
+                messages.append(
+                    RTVIBotOutputMessage(
+                        data=RTVIBotOutputMessageData(
+                            text=self._bot_transcription, spoken=False, aggregated_by="sentence"
+                        )
+                    )
+                )
+                should_reset_transcription = True
+
+        if match_endofsentence(self._bot_transcription) and len(self._bot_transcription) > 0:
+            messages.append(
+                RTVIBotTranscriptionMessage(data=RTVITextMessageData(text=self._bot_transcription))
+            )
+            if frame.skip_tts:
+                messages.append(
+                    RTVIBotOutputMessage(
+                        data=RTVIBotOutputMessageData(
+                            text=self._bot_transcription, spoken=False, aggregated_by="sentence"
+                        )
+                    )
+                )
+            should_reset_transcription = True
+
+        for msg in messages:
+            await self.send_rtvi_message(msg)
+        if should_reset_transcription:
+            self._bot_transcription = ""

     async def _handle_user_transcriptions(self, frame: Frame):
         """Handle user transcription frames."""

src/pipecat/services/aws/nova_sonic/llm.py

Lines changed: 4 additions & 2 deletions
@@ -1027,7 +1027,7 @@ async def _report_assistant_response_text_added(self, text):
         logger.debug(f"Assistant response text added: {text}")

         # Report the text of the assistant response.
-        await self.push_frame(TTSTextFrame(text))
+        await self.push_frame(TTSTextFrame(text, aggregated_by="sentence", spoken=True))

         # HACK: here we're also buffering the assistant text ourselves as a
         # backup rather than relying solely on the assistant context aggregator
@@ -1060,7 +1060,9 @@ async def _report_assistant_response_ended(self):
         # TTSTextFrame would be ignored otherwise (the interruption frame
         # would have cleared the assistant aggregator state).
         await self.push_frame(LLMFullResponseStartFrame())
-        await self.push_frame(TTSTextFrame(self._assistant_text_buffer))
+        await self.push_frame(
+            TTSTextFrame(self._assistant_text_buffer, aggregated_by="sentence", spoken=True)
+        )
         self._may_need_repush_assistant_text = False

         # Report the end of the assistant response.

src/pipecat/services/google/gemini_live/llm.py

Lines changed: 1 addition & 1 deletion
@@ -1459,7 +1459,7 @@ async def _handle_msg_output_transcription(self, message: LiveServerMessage):
         self._llm_output_buffer += text

         await self.push_frame(LLMTextFrame(text=text))
-        await self.push_frame(TTSTextFrame(text=text))
+        await self.push_frame(TTSTextFrame(text=text, aggregated_by="sentence", spoken=True))

     async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
         """Handle dedicated grounding metadata messages."""

src/pipecat/services/openai/realtime/llm.py

Lines changed: 1 addition & 1 deletion
@@ -673,7 +673,7 @@ async def _handle_evt_text_delta(self, evt):
     async def _handle_evt_audio_transcript_delta(self, evt):
         if evt.delta:
             await self.push_frame(LLMTextFrame(evt.delta))
-            await self.push_frame(TTSTextFrame(evt.delta))
+            await self.push_frame(TTSTextFrame(evt.delta, aggregated_by="sentence", spoken=True))

     async def _handle_evt_function_call_arguments_done(self, evt):
         """Handle completion of function call arguments.

src/pipecat/services/openai_realtime_beta/openai.py

Lines changed: 1 addition & 1 deletion
@@ -654,7 +654,7 @@ async def _handle_evt_text_delta(self, evt):
     async def _handle_evt_audio_transcript_delta(self, evt):
         if evt.delta:
             await self.push_frame(LLMTextFrame(evt.delta))
-            await self.push_frame(TTSTextFrame(evt.delta))
+            await self.push_frame(TTSTextFrame(evt.delta, aggregated_by="sentence", spoken=True))

     async def _handle_evt_speech_started(self, evt):
         await self._truncate_current_audio_response()

src/pipecat/services/tts_service.py

Lines changed: 46 additions & 27 deletions
@@ -101,6 +101,8 @@ def __init__(
         sample_rate: Optional[int] = None,
         # Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
         text_aggregator: Optional[BaseTextAggregator] = None,
+        # Types of text aggregations that should not be spoken.
+        skip_aggregator_types: Optional[List[str]] = [],
         # Text filter executed after text has been aggregated.
         text_filters: Optional[Sequence[BaseTextFilter]] = None,
         text_filter: Optional[BaseTextFilter] = None,
@@ -120,6 +122,7 @@ def __init__(
             pause_frame_processing: Whether to pause frame processing during audio generation.
             sample_rate: Output sample rate for generated audio.
             text_aggregator: Custom text aggregator for processing incoming text.
+            skip_aggregator_types: List of aggregation types that should not be spoken.
             text_filters: Sequence of text filters to apply after aggregation.
             text_filter: Single text filter (deprecated, use text_filters).
@@ -142,6 +145,7 @@ def __init__(
         self._voice_id: str = ""
         self._settings: Dict[str, Any] = {}
         self._text_aggregator: BaseTextAggregator = text_aggregator or SimpleTextAggregator()
+        self._skip_aggregator_types: List[str] = skip_aggregator_types or []
         self._text_filters: Sequence[BaseTextFilter] = text_filters or []
         self._transport_destination: Optional[str] = transport_destination
         self._tracing_enabled: bool = False
@@ -351,10 +355,14 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             # pause to avoid audio overlapping.
             await self._maybe_pause_frame_processing()

-            sentence = self._text_aggregator.text
+            aggregate = self._text_aggregator.text
             await self._text_aggregator.reset()
             self._processing_text = False
-            await self._push_tts_frames(sentence)
+            await self._push_tts_frames(
+                text=aggregate.text,
+                should_speak=aggregate.type not in self._skip_aggregator_types,
+                aggregated_by=aggregate.type,
+            )
             if isinstance(frame, LLMFullResponseEndFrame):
                 if self._push_text_frames:
                     await self.push_frame(frame, direction)
@@ -363,7 +371,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, TTSSpeakFrame):
             # Store if we were processing text or not so we can set it back.
             processing_text = self._processing_text
-            await self._push_tts_frames(frame.text)
+            await self._push_tts_frames(frame.text, should_speak=True, aggregated_by="word")
             # We pause processing incoming frames because we are sending data to
             # the TTS. We pause to avoid audio overlapping.
             await self._maybe_pause_frame_processing()
@@ -455,42 +463,53 @@ async def _process_text_frame(self, frame: TextFrame):
         text: Optional[str] = None
         if not self._aggregate_sentences:
             text = frame.text
+            should_speak = True
+            aggregated_by = "token"
         else:
-            text = await self._text_aggregator.aggregate(frame.text)
+            aggregate = await self._text_aggregator.aggregate(frame.text)
+            if aggregate:
+                text = aggregate.text
+                should_speak = aggregate.type not in self._skip_aggregator_types
+                aggregated_by = aggregate.type

         if text:
-            await self._push_tts_frames(text)
+            logger.trace(f"Pushing TTS frames for text: {text}, {should_speak}, {aggregated_by}")
+            await self._push_tts_frames(text, should_speak, aggregated_by)

-    async def _push_tts_frames(self, text: str):
-        # Remove leading newlines only
-        text = text.lstrip("\n")
+    async def _push_tts_frames(self, text: str, should_speak: bool, aggregated_by: str):
+        if should_speak:
+            # Remove leading newlines only
+            text = text.lstrip("\n")

-        # Don't send only whitespace. This causes problems for some TTS models. But also don't
-        # strip all whitespace, as whitespace can influence prosody.
-        if not text.strip():
-            return
+            # Don't send only whitespace. This causes problems for some TTS models. But also don't
+            # strip all whitespace, as whitespace can influence prosody.
+            if not text.strip():
+                return

-        # This is just a flag that indicates if we sent something to the TTS
-        # service. It will be cleared if we sent text because of a TTSSpeakFrame
-        # or when we received an LLMFullResponseEndFrame
-        self._processing_text = True
+            # This is just a flag that indicates if we sent something to the TTS
+            # service. It will be cleared if we sent text because of a TTSSpeakFrame
+            # or when we received an LLMFullResponseEndFrame
+            self._processing_text = True

-        await self.start_processing_metrics()
+            await self.start_processing_metrics()

-        # Process all filters.
-        for filter in self._text_filters:
-            await filter.reset_interruption()
-            text = await filter.filter(text)
+            # Process all filters.
+            for filter in self._text_filters:
+                await filter.reset_interruption()
+                text = await filter.filter(text)

-        if text:
-            await self.process_generator(self.run_tts(text))
+            if text:
+                await self.push_frame(TTSTextFrame(text, spoken=True, aggregated_by=aggregated_by))
+                await self.process_generator(self.run_tts(text))

-        await self.stop_processing_metrics()
+            await self.stop_processing_metrics()

-        if self._push_text_frames:
+        if self._push_text_frames or not should_speak:
             # We send the original text after the audio. This way, if we are
             # interrupted, the text is not added to the assistant context.
-            await self.push_frame(TTSTextFrame(text))
+            await self.push_frame(
+                TTSTextFrame(text, spoken=should_speak, aggregated_by=aggregated_by)
+            )

     async def _stop_frame_handler(self):
         has_started = False
@@ -616,7 +635,7 @@ async def _words_task_handler(self):
                     frame = TTSStoppedFrame()
                     frame.pts = last_pts
                 else:
-                    frame = TTSTextFrame(word)
+                    frame = TTSTextFrame(word, spoken=True, aggregated_by="word")
                     frame.pts = self._initial_word_timestamp + timestamp
                 if frame:
                     last_pts = frame.pts
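A sketch of opting text out of synthesis with the new constructor argument; CartesiaTTSService stands in for any TTSService subclass, and the "code" aggregation type is hypothetical, something a custom aggregator (rather than the default SimpleTextAggregator) might produce:

from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator

tts = CartesiaTTSService(
    api_key="...",
    voice_id="...",
    # Default sentence aggregation; a custom aggregator could return other types.
    text_aggregator=SimpleTextAggregator(),
    # Hypothetical custom type: aggregations tagged "code" by such an aggregator
    # would be pushed as TTSTextFrames with spoken=False instead of synthesized.
    skip_aggregator_types=["code"],
)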

src/pipecat/utils/text/base_text_aggregator.py

Lines changed: 34 additions & 4 deletions
@@ -12,9 +12,38 @@
 """

 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from typing import Optional


+@dataclass
+class Aggregation:
+    """Data class representing aggregated text and its type.
+
+    An Aggregation object is created whenever a stream of text is aggregated by
+    a text aggregator. It contains the aggregated text and a type indicating
+    the nature of the aggregation.
+    """
+
+    def __init__(self, text: str, type: str):
+        """Initialize an aggregation instance.
+
+        Args:
+            text: The aggregated text content.
+            type: The type of aggregation the text represents (e.g., 'sentence', 'word', 'token', 'my_custom_aggregation').
+        """
+        self.text = text
+        self.type = type
+
+    def __str__(self) -> str:
+        """Return a string representation of the aggregation.
+
+        Returns:
+            A descriptive string showing the type and text of the aggregation.
+        """
+        return f"Aggregation by {self.type}: {self.text}"
+
+
 class BaseTextAggregator(ABC):
     """Base class for text aggregators in the Pipecat framework.

3059

3160
@property
3261
@abstractmethod
33-
def text(self) -> str:
62+
def text(self) -> Aggregation:
3463
"""Get the currently aggregated text.
3564
3665
Subclasses must implement this property to return the text that has
@@ -42,12 +71,13 @@ def text(self) -> str:
4271
pass
4372

4473
@abstractmethod
45-
async def aggregate(self, text: str) -> Optional[str]:
74+
async def aggregate(self, text: str) -> Optional[Aggregation]:
4675
"""Aggregate the specified text with the currently accumulated text.
4776
4877
This method should be implemented to define how the new text contributes
49-
to the aggregation process. It returns the updated aggregated text if
50-
it's ready to be processed, or None otherwise.
78+
to the aggregation process. It returns the aggregated text and a string
79+
describing how it was aggregated if it's ready to be processed,
80+
or None otherwise.
5181
5282
Subclasses should implement their specific logic for:
5383
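To show the new contract end to end, a minimal custom aggregator sketch that returns typed Aggregation objects. It assumes the base class's remaining abstract hooks are handle_interruption() and reset(); check the base class for the exact set:

from typing import Optional

from pipecat.utils.text.base_text_aggregator import Aggregation, BaseTextAggregator


class WordAggregator(BaseTextAggregator):
    """Hypothetical aggregator that emits one Aggregation per whitespace-ended word."""

    def __init__(self):
        self._text = ""

    @property
    def text(self) -> Aggregation:
        # Whatever is buffered so far, tagged with this aggregator's type.
        return Aggregation(self._text, "word")

    async def aggregate(self, text: str) -> Optional[Aggregation]:
        self._text += text
        if self._text.endswith(" "):
            aggregation = Aggregation(self._text, "word")
            self._text = ""
            return aggregation
        return None  # not ready yet; keep buffering

    async def handle_interruption(self):
        self._text = ""

    async def reset(self):
        self._text = ""

Paired with skip_aggregator_types=["word"] on a TTS service, every aggregation from this class would be emitted as unspoken bot output.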
