text input from datastream for v1.0 (#1521)

longcw · theomonnom · commit dbba9900a93e · 2025-02-21T11:41:59.000+01:00
diff --git a/livekit-agents/livekit/agents/llm/realtime.py b/livekit-agents/livekit/agents/llm/realtime.py
@@ -18,7 +18,7 @@ class InputSpeechStartedEvent:
 
 @dataclass
 class InputSpeechStoppedEvent:
-    pass
+    user_transcription_enabled: bool  # read by _on_input_speech_stopped to decide whether to emit an empty interim transcript
 
 
 @dataclass
diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py
@@ -255,6 +255,12 @@ def generate_reply(
             allow_interruptions=allow_interruptions,
         )
 
+    def interrupt(self) -> None:  # raises ValueError when no activity is set, otherwise delegates to it
+        if self._activity is None:
+            raise ValueError("PipelineAgent isn't running")
+        # an activity is attached at this point, so forward the interruption to it
+        self._activity.interrupt()
+
     def update_task(self, task: AgentTask) -> None:
         self._agent_task = task
 
diff --git a/livekit-agents/livekit/agents/pipeline/room_io.py b/livekit-agents/livekit/agents/pipeline/room_io.py
@@ -17,6 +17,8 @@
 
 @dataclass(frozen=True)
 class RoomInputOptions:
+    text_enabled: bool = True
+    """Whether to subscribe to text input"""
     audio_enabled: bool = True
     """Whether to subscribe to audio"""
     video_enabled: bool = False
@@ -48,6 +50,7 @@ class RoomOutputOptions:
 DEFAULT_ROOM_INPUT_OPTIONS = RoomInputOptions()
 DEFAULT_ROOM_OUTPUT_OPTIONS = RoomOutputOptions()
 LK_PUBLISH_FOR_ATTR = "lk.publish_for"
+LK_TEXT_INPUT_TOPIC = "lk.room_text_input"  # topic passed to register_text_stream_handler when text_enabled is set
 
 
 class BaseStreamHandle:
@@ -226,6 +229,7 @@ def __init__(
         """
         self._options = options
         self._room = room
+        self._agent: Optional["PipelineAgent"] = None
         self._tasks: set[asyncio.Task] = set()
 
         # target participant
@@ -263,6 +267,12 @@ def __init__(
         for participant in self._room.remote_participants.values():
             self._on_participant_connected(participant)
 
+        # text input from datastream
+        if options.text_enabled:
+            self._room.register_text_stream_handler(
+                LK_TEXT_INPUT_TOPIC, self._on_text_input
+            )
+
     @property
     def audio(self) -> AsyncIterator[rtc.AudioFrame] | None:
         if not self._audio_handle:
@@ -287,7 +297,9 @@ async def start(self, agent: Optional["PipelineAgent"] = None) -> None:
             # link to the first connected participant if not set
             self.set_participant(participant.identity)
 
-        if not agent:
+        # TODO(long): should we force the agent to be set or provide a set_agent method?
+        self._agent = agent
+        if not self._agent:
             return
 
         agent.input.audio = self.audio
@@ -399,6 +411,28 @@ async def _capture_text():
         self._tasks.add(task)
         task.add_done_callback(self._tasks.discard)
 
+    def _on_text_input(  # callback registered for LK_TEXT_INPUT_TOPIC text streams
+        self, reader: rtc.TextStreamReader, participant_identity: str
+    ) -> None:
+        if participant_identity != self._participant_identity:
+            return  # ignore streams from any identity other than self._participant_identity
+
+        async def _read_text():
+            if not self._agent:
+                return  # no agent stored by start(); the stream is left unread
+            # wait for read_all() to finish before touching the agent
+            text = await reader.read_all()
+            logger.debug(
+                "received text input",
+                extra={"text": text, "participant": self._participant_identity},
+            )
+            self._agent.interrupt()  # interrupt first, then request a reply to the received text
+            self._agent.generate_reply(user_input=text)
+        # run the read as a background task; self._tasks holds a reference until it completes
+        task = asyncio.create_task(_read_text())
+        self._tasks.add(task)
+        task.add_done_callback(self._tasks.discard)
+
     async def aclose(self) -> None:
         self._room.off("participant_connected", self._on_participant_connected)
         self._room.off("participant_disconnected", self._on_participant_disconnected)
diff --git a/livekit-agents/livekit/agents/pipeline/task_activity.py b/livekit-agents/livekit/agents/pipeline/task_activity.py
@@ -319,14 +319,15 @@ def _on_input_speech_started(self, _: llm.InputSpeechStartedEvent) -> None:
         log_event("input_speech_started")
         self.interrupt()  # input_speech_started is also interrupting on the serverside realtime session
 
-    def _on_input_speech_stopped(self, _: llm.InputSpeechStoppedEvent) -> None:
+    def _on_input_speech_stopped(self, ev: llm.InputSpeechStoppedEvent) -> None:  # the event is now read, hence the named parameter
         log_event("input_speech_stopped")
-        self.on_interim_transcript(
-            stt.SpeechEvent(
-                stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                alternatives=[stt.SpeechData(text="", language="")],
+        if ev.user_transcription_enabled:  # skip the interim transcript when the event's flag is false
+            self.on_interim_transcript(
+                stt.SpeechEvent(
+                    stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                    alternatives=[stt.SpeechData(text="", language="")],  # empty text and language
+                )
             )
-        )
 
     def _on_input_audio_transcription_completed(
         self, ev: llm.InputTranscriptionCompleted