@@ -584,36 +584,38 @@ async def _push_tts_frames(self, src_frame: AggregatedTextFrame):
             await filter.reset_interruption()
             text = await filter.filter(text)
 
-        if text:
-            if not self._push_text_frames:
-                # In a typical pipeline, there is an assistant context aggregator
-                # that listens for TTSTextFrames to add spoken text to the context.
-                # If the TTS service supports word timestamps, then _push_text_frames
-                # is set to False and these are sent word by word as part of the
-                # _words_task_handler in the WordTTSService subclass. However, to
-                # support use cases where an observer may want the full text before
-                # the audio is generated, we send along the AggregatedTextFrame here,
-                # but we set append_to_context to False so it does not cause duplication
-                # in the context. This is primarily used by the RTVIObserver to
-                # generate a complete bot-output.
-                src_frame.append_to_context = False
-                await self.push_frame(src_frame)
-            # Note: Text transformations only affect the text sent to the TTS. This allows
-            # for explicit TTS-specific modifications (e.g., inserting TTS supported tags
-            # for spelling or emotion or replacing an @ with "at"). For TTS services that
-            # support word-level timestamps, this DOES affect the resulting context as the
-            # the context is built from the TTSTextFrames generated during word timestamping.
-            for aggregation_type, transform in self._text_transforms:
-                if aggregation_type == type or aggregation_type == "*":
-                    text = await transform(text, type)
-            await self.process_generator(self.run_tts(text))
+        if not text.strip():
+            await self.stop_processing_metrics()
+            return
+
+        # To support use cases that may want to know the text before it's spoken, we
+        # push the AggregatedTextFrame version before transforming and sending to TTS.
+        # However, we do not want to add this text to the assistant context until it
+        # is spoken, so we set append_to_context to False.
+        src_frame.append_to_context = False
+        await self.push_frame(src_frame)
+
+        # Note: Text transformations are meant to only affect the text sent to the TTS for
+        # TTS-specific purposes. This allows for explicit TTS modifications (e.g., inserting
+        # TTS-supported tags for spelling or emotion or replacing an @ with "at"). For TTS
+        # services that support word-level timestamps, this CAN affect the resulting context
+        # since the TTSTextFrames are generated from the TTS output stream.
+        transformed_text = text
+        for aggregation_type, transform in self._text_transforms:
+            if aggregation_type == type or aggregation_type == "*":
+                transformed_text = await transform(transformed_text, type)
+        await self.process_generator(self.run_tts(transformed_text))
 
         await self.stop_processing_metrics()
 
         if self._push_text_frames:
-            # In the case where the TTS service does not support word timestamps,
-            # we send the full aggregated text after the audio. This way, if we are
-            # interrupted, the text is not added to the assistant context.
+            # In TTS services that support word timestamps, the TTSTextFrames
+            # are pushed as words are spoken. However, in the case where the TTS service
+            # does not support word timestamps (i.e., _push_text_frames is True), we send
+            # the original (non-transformed) text after the TTS generation has completed.
+            # This way, if we are interrupted, the text is not added to the assistant
+            # context and the context that IS added does not include TTS-specific tags
+            # or transformations.
             frame = TTSTextFrame(text, aggregated_by=type)
             frame.includes_inter_frame_spaces = self.includes_inter_frame_spaces
             await self.push_frame(frame)
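
For reference, the new loop treats `self._text_transforms` as a sequence of `(aggregation_type, transform)` pairs, where each transform appears to be an async callable that takes the current text plus the aggregation type and returns the (possibly modified) text, and a transform registered under `"*"` runs for every aggregation type. Below is a minimal standalone sketch of that dispatch logic under those assumptions; `apply_transforms` and `expand_percent` are illustrative names, and how transforms get registered on the service is not shown in this diff:

```python
import asyncio
from typing import Awaitable, Callable

# Assumed transform signature, inferred from the loop in _push_tts_frames:
# an async callable that receives the text and the aggregation type and
# returns the (possibly modified) text to send to the TTS.
TextTransform = Callable[[str, str], Awaitable[str]]


async def expand_percent(text: str, aggregation_type: str) -> str:
    # Hypothetical TTS-only tweak: make "%" pronounceable.
    return text.replace("%", " percent")


async def apply_transforms(
    text: str,
    aggregation_type: str,
    transforms: list[tuple[str, TextTransform]],
) -> str:
    # Mirrors the new loop: a transform runs when its registered aggregation
    # type matches the current one, or when it was registered under "*".
    transformed_text = text
    for registered_type, transform in transforms:
        if registered_type == aggregation_type or registered_type == "*":
            transformed_text = await transform(transformed_text, aggregation_type)
    return transformed_text


async def main():
    transforms = [("*", expand_percent)]
    print(await apply_transforms("CPU usage is at 75%", "sentence", transforms))
    # -> CPU usage is at 75 percent


asyncio.run(main())
```

Note that only `transformed_text` is handed to `run_tts`; the untouched `text` is what the `TTSTextFrame` pushed at the end of the hunk carries into the assistant context, so TTS-only markup never leaks into the conversation history.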