@@ -584,36 +584,38 @@ async def _push_tts_frames(self, src_frame: AggregatedTextFrame):
             await filter.reset_interruption()
             text = await filter.filter(text)
 
-        if text:
-            if not self._push_text_frames:
-                # In a typical pipeline, there is an assistant context aggregator
-                # that listens for TTSTextFrames to add spoken text to the context.
-                # If the TTS service supports word timestamps, then _push_text_frames
-                # is set to False and these are sent word by word as part of the
-                # _words_task_handler in the WordTTSService subclass. However, to
-                # support use cases where an observer may want the full text before
-                # the audio is generated, we send along the AggregatedTextFrame here,
-                # but we set append_to_context to False so it does not cause duplication
-                # in the context. This is primarily used by the RTVIObserver to
-                # generate a complete bot-output.
-                src_frame.append_to_context = False
-                await self.push_frame(src_frame)
-            # Note: Text transformations only affect the text sent to the TTS. This allows
-            # for explicit TTS-specific modifications (e.g., inserting TTS supported tags
-            # for spelling or emotion or replacing an @ with "at"). For TTS services that
-            # support word-level timestamps, this DOES affect the resulting context as the
-            # the context is built from the TTSTextFrames generated during word timestamping.
-            for aggregation_type, transform in self._text_transforms:
-                if aggregation_type == type or aggregation_type == "*":
-                    text = await transform(text, type)
-            await self.process_generator(self.run_tts(text))
+        if not text.strip():
+            await self.stop_processing_metrics()
+            return
+
+        # To support use cases that may want to know the text before it's spoken, we
+        # push the AggregatedTextFrame version before transforming and sending to TTS.
+        # However, we do not want to add this text to the assistant context until it
+        # is spoken, so we set append_to_context to False.
+        src_frame.append_to_context = False
+        await self.push_frame(src_frame)
+
+        # Note: Text transformations are meant to only affect the text sent to the TTS for
+        # TTS-specific purposes. This allows for explicit TTS modifications (e.g., inserting
+        # TTS-supported tags for spelling or emotion or replacing an @ with "at"). For TTS
+        # services that support word-level timestamps, this CAN affect the resulting context
+        # since the TTSTextFrames are generated from the TTS output stream.
+        transformed_text = text
+        for aggregation_type, transform in self._text_transforms:
+            if aggregation_type == type or aggregation_type == "*":
+                transformed_text = await transform(transformed_text, type)
+        await self.process_generator(self.run_tts(transformed_text))
 
         await self.stop_processing_metrics()
 
         if self._push_text_frames:
-            # In the case where the TTS service does not support word timestamps,
-            # we send the full aggregated text after the audio. This way, if we are
-            # interrupted, the text is not added to the assistant context.
+            # In TTS services that support word timestamps, the TTSTextFrames
+            # are pushed as words are spoken. However, in the case where the TTS service
+            # does not support word timestamps (i.e., _push_text_frames is True), we send
+            # the original (non-transformed) text after the TTS generation has completed.
+            # This way, if we are interrupted, the text is not added to the assistant
+            # context and the context that IS added does not include TTS-specific tags
+            # or transformations.
             frame = TTSTextFrame(text, aggregated_by=type)
             frame.includes_inter_frame_spaces = self.includes_inter_frame_spaces
             await self.push_frame(frame)
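
For reference, the new loop treats `self._text_transforms` as a sequence of `(aggregation_type, transform)` pairs, where each transform appears to be an async callable that takes the current text plus the aggregation type and returns the (possibly modified) text, and a transform registered under `"*"` runs for every aggregation type. Below is a minimal standalone sketch of that dispatch logic under those assumptions; `apply_transforms` and `expand_percent` are illustrative names, and how transforms get registered on the service is not shown in this diff:

```python
import asyncio
from typing import Awaitable, Callable

# Assumed transform signature, inferred from the loop in _push_tts_frames:
# an async callable that receives the text and the aggregation type and
# returns the (possibly modified) text to send to the TTS.
TextTransform = Callable[[str, str], Awaitable[str]]


async def expand_percent(text: str, aggregation_type: str) -> str:
    # Hypothetical TTS-only tweak: make "%" pronounceable.
    return text.replace("%", " percent")


async def apply_transforms(
    text: str,
    aggregation_type: str,
    transforms: list[tuple[str, TextTransform]],
) -> str:
    # Mirrors the new loop: a transform runs when its registered aggregation
    # type matches the current one, or when it was registered under "*".
    transformed_text = text
    for registered_type, transform in transforms:
        if registered_type == aggregation_type or registered_type == "*":
            transformed_text = await transform(transformed_text, aggregation_type)
    return transformed_text


async def main():
    transforms = [("*", expand_percent)]
    print(await apply_transforms("CPU usage is at 75%", "sentence", transforms))
    # -> CPU usage is at 75 percent


asyncio.run(main())
```

Note that only `transformed_text` is handed to `run_tts`; the untouched `text` is what the `TTSTextFrame` pushed at the end of the hunk carries into the assistant context, so TTS-only markup never leaks into the conversation history.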