add better word alignment for Cartesia (#3876)

chenghao-mou · web-flow · commit a18d5e9f231f · 2025-11-11T17:45:24.000Z
Two irrelevant checks failed, skipping them for now.
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py
@@ -19,6 +19,7 @@
 import json
 import os
 import weakref
+from collections import deque
 from dataclasses import dataclass, replace
 from typing import Any, Union, cast
 
@@ -171,6 +172,19 @@ def __init__(
         elif isinstance(text_pacing, tts.SentenceStreamPacer):
             self._stream_pacer = text_pacing
 
+        if word_timestamps:
+            if "preview" not in self._opts.model and self._opts.language not in {
+                "en",
+                "de",
+                "es",
+                "fr",
+            }:
+                # https://docs.cartesia.ai/api-reference/tts/compare-tts-endpoints
+                logger.warning(
+                    "word_timestamps is only supported for languages en, de, es, and fr with `sonic` models"
+                    " or all languages with `preview` models"
+                )
+
     @property
     def model(self) -> str:
         return self._opts.model
@@ -348,6 +362,7 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
             stream=True,
         )
         input_sent_event = asyncio.Event()
+        sent_tokens = deque[str]()
 
         sent_tokenizer_stream = self._tts._sentence_tokenizer.stream()
         if self._tts._stream_pacer:
@@ -363,6 +378,7 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                 token_pkt = base_pkt.copy()
                 token_pkt["context_id"] = context_id
                 token_pkt["transcript"] = ev.token + " "
+                sent_tokens.append(ev.token + " ")
                 token_pkt["continue"] = True
                 self._mark_started()
                 await ws.send_str(json.dumps(token_pkt))
@@ -371,6 +387,7 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
             end_pkt = base_pkt.copy()
             end_pkt["context_id"] = context_id
             end_pkt["transcript"] = " "
+            sent_tokens.append(" ")
             end_pkt["continue"] = False
             await ws.send_str(json.dumps(end_pkt))
             input_sent_event.set()
@@ -387,6 +404,7 @@ async def _input_task() -> None:
         async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
             current_segment_id: str | None = None
             await input_sent_event.wait()
+            skip_aligning = False
             while True:
                 msg = await ws.receive(timeout=self._conn_options.timeout)
                 if msg.type in (
@@ -416,10 +434,26 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                         output_emitter.end_input()
                         break
                 elif word_timestamps := data.get("word_timestamps"):
+                    # assuming Cartesia echoes the sent text in the original format and order.
                     for word, start, end in zip(
                         word_timestamps["words"], word_timestamps["start"], word_timestamps["end"]
                     ):
-                        word = f"{word} "  # TODO(long): any better way to format the words?
+                        if not sent_tokens or skip_aligning:
+                            word = f"{word} "
+                            skip_aligning = True
+                        else:
+                            sent = sent_tokens.popleft()
+                            if (idx := sent.find(word)) != -1:
+                                word, sent = sent[: idx + len(word)], sent[idx + len(word) :]
+                                if sent.strip():
+                                    sent_tokens.appendleft(sent)
+                                elif sent and sent_tokens:
+                                    # prepend the remaining whitespace to the next sentence
+                                    sent_tokens[0] = sent + sent_tokens[0]
+                            else:
+                                word = f"{word} "
+                                skip_aligning = True
+
                         output_emitter.push_timed_transcript(
                             TimedString(text=word, start_time=start, end_time=end)
                         )