@@ -101,6 +101,8 @@ def __init__(
         sample_rate: Optional[int] = None,
         # Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
         text_aggregator: Optional[BaseTextAggregator] = None,
+        # Types of text aggregations that should not be spoken.
+        skip_aggregator_types: Optional[List[str]] = [],
         # Text filter executed after text has been aggregated.
         text_filters: Optional[Sequence[BaseTextFilter]] = None,
         text_filter: Optional[BaseTextFilter] = None,
@@ -120,6 +122,7 @@ def __init__(
             pause_frame_processing: Whether to pause frame processing during audio generation.
             sample_rate: Output sample rate for generated audio.
             text_aggregator: Custom text aggregator for processing incoming text.
+            skip_aggregator_types: List of aggregation types that should not be spoken.
             text_filters: Sequence of text filters to apply after aggregation.
             text_filter: Single text filter (deprecated, use text_filters).
 
@@ -142,6 +145,7 @@ def __init__(
         self._voice_id: str = ""
         self._settings: Dict[str, Any] = {}
         self._text_aggregator: BaseTextAggregator = text_aggregator or SimpleTextAggregator()
+        self._skip_aggregator_types: List[str] = skip_aggregator_types or []
         self._text_filters: Sequence[BaseTextFilter] = text_filters or []
         self._transport_destination: Optional[str] = transport_destination
         self._tracing_enabled: bool = False
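A hedged usage sketch of the new `skip_aggregator_types` parameter. The service class, the aggregator, and the `"reasoning"` type name below are illustrative placeholders, not names defined by this PR; only the parameter itself comes from this diff:

```python
# Illustrative only: SomeTTSService and TaggingAggregator are hypothetical stand-ins
# for a concrete TTSService subclass and a custom BaseTextAggregator.
tts = SomeTTSService(
    api_key="...",
    text_aggregator=TaggingAggregator(),  # labels each aggregate with a type
    skip_aggregator_types=["reasoning"],  # aggregates of this type are pushed as text but not spoken
)
```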
@@ -351,10 +355,14 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             # pause to avoid audio overlapping.
             await self._maybe_pause_frame_processing()
 
-            sentence = self._text_aggregator.text
+            aggregate = self._text_aggregator.text
             await self._text_aggregator.reset()
             self._processing_text = False
-            await self._push_tts_frames(sentence)
+            await self._push_tts_frames(
+                text=aggregate.text,
+                should_speak=aggregate.type not in self._skip_aggregator_types,
+                aggregated_by=aggregate.type,
+            )
             if isinstance(frame, LLMFullResponseEndFrame):
                 if self._push_text_frames:
                     await self.push_frame(frame, direction)
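The hunk above assumes the aggregator's `text` property (and `aggregate()` further down) now returns an object carrying both the text and its aggregation type, rather than a plain string. A minimal sketch of such a result type; the class name and the example type value are assumptions inferred from this diff:

```python
from dataclasses import dataclass


@dataclass
class AggregatedText:  # hypothetical name; the real type would ship with BaseTextAggregator
    text: str  # the aggregated text to synthesize and/or push downstream as a TTSTextFrame
    type: str  # aggregation type (e.g. "sentence"), compared against skip_aggregator_types
```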
@@ -363,7 +371,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, TTSSpeakFrame):
             # Store if we were processing text or not so we can set it back.
             processing_text = self._processing_text
-            await self._push_tts_frames(frame.text)
+            await self._push_tts_frames(frame.text, should_speak=True, aggregated_by="word")
             # We pause processing incoming frames because we are sending data to
             # the TTS. We pause to avoid audio overlapping.
             await self._maybe_pause_frame_processing()
@@ -455,42 +463,53 @@ async def _process_text_frame(self, frame: TextFrame):
         text: Optional[str] = None
         if not self._aggregate_sentences:
             text = frame.text
+            should_speak = True
+            aggregated_by = "token"
         else:
-            text = await self._text_aggregator.aggregate(frame.text)
+            aggregate = await self._text_aggregator.aggregate(frame.text)
+            if aggregate:
+                text = aggregate.text
+                should_speak = aggregate.type not in self._skip_aggregator_types
+                aggregated_by = aggregate.type
 
         if text:
-            await self._push_tts_frames(text)
+            logger.trace(f"Pushing TTS frames for text: {text}, {should_speak}, {aggregated_by}")
+            await self._push_tts_frames(text, should_speak, aggregated_by)
 
-    async def _push_tts_frames(self, text: str):
-        # Remove leading newlines only
-        text = text.lstrip("\n")
+    async def _push_tts_frames(self, text: str, should_speak: bool, aggregated_by: str):
+        if should_speak:
+            # Remove leading newlines only
+            text = text.lstrip("\n")
 
-        # Don't send only whitespace. This causes problems for some TTS models. But also don't
-        # strip all whitespace, as whitespace can influence prosody.
-        if not text.strip():
-            return
+            # Don't send only whitespace. This causes problems for some TTS models. But also don't
+            # strip all whitespace, as whitespace can influence prosody.
+            if not text.strip():
+                return
 
-        # This is just a flag that indicates if we sent something to the TTS
-        # service. It will be cleared if we sent text because of a TTSSpeakFrame
-        # or when we received an LLMFullResponseEndFrame
-        self._processing_text = True
+            # This is just a flag that indicates if we sent something to the TTS
+            # service. It will be cleared if we sent text because of a TTSSpeakFrame
+            # or when we received an LLMFullResponseEndFrame
+            self._processing_text = True
 
-        await self.start_processing_metrics()
+            await self.start_processing_metrics()
 
-        # Process all filter.
-        for filter in self._text_filters:
-            await filter.reset_interruption()
-            text = await filter.filter(text)
+            # Process all filter.
+            for filter in self._text_filters:
+                await filter.reset_interruption()
+                text = await filter.filter(text)
 
-        if text:
-            await self.process_generator(self.run_tts(text))
+            if text:
+                await self.push_frame(TTSTextFrame(text, spoken=True, aggregated_by=aggregated_by))
+                await self.process_generator(self.run_tts(text))
 
-        await self.stop_processing_metrics()
+            await self.stop_processing_metrics()
 
-        if self._push_text_frames:
+        if self._push_text_frames or not should_speak:
             # We send the original text after the audio. This way, if we are
             # interrupted, the text is not added to the assistant context.
-            await self.push_frame(TTSTextFrame(text))
+            await self.push_frame(
+                TTSTextFrame(text, spoken=should_speak, aggregated_by=aggregated_by)
+            )
 
     async def _stop_frame_handler(self):
         has_started = False
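For context, a rough sketch of a custom aggregator that could drive this path. The async `aggregate()`/`reset()` methods and the `text` property are inferred from how they are called in this diff; the real `BaseTextAggregator` interface may require additional methods, and `AggregatedText` is the hypothetical result type sketched earlier:

```python
class SentenceTaggingAggregator(BaseTextAggregator):  # hypothetical subclass
    """Buffers tokens and emits a typed aggregate at each sentence boundary.

    A real aggregator would assign other type values for non-speakable content
    so that skip_aggregator_types can keep it out of the audio while still
    emitting it as a TTSTextFrame with spoken=False.
    """

    def __init__(self):
        self._buffer = ""

    @property
    def text(self):
        # Whatever is still buffered, flushed when the LLM response ends.
        return AggregatedText(text=self._buffer, type="sentence")

    async def aggregate(self, text: str):
        self._buffer += text
        if self._buffer.rstrip().endswith((".", "!", "?")):
            result = AggregatedText(text=self._buffer, type="sentence")
            self._buffer = ""
            return result
        return None  # keep buffering until a sentence boundary arrives

    async def reset(self):
        self._buffer = ""
```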
@@ -616,7 +635,7 @@ async def _words_task_handler(self):
                 frame = TTSStoppedFrame()
                 frame.pts = last_pts
             else:
-                frame = TTSTextFrame(word)
+                frame = TTSTextFrame(word, spoken=True, aggregated_by="word")
                 frame.pts = self._initial_word_timestamp + timestamp
             if frame:
                 last_pts = frame.pts