livekit · Namit1867 · Apr 7, 2026
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py
@@ -9,7 +9,51 @@
 ]
 
 TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo", "sonic-3"]
-TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
+TTSLanguages = Literal[
+    "en",
+    "es",
+    "fr",
+    "de",
+    "pt",
+    "zh",
+    "ja",
+    "hi",
+    "ko",
+    "it",
+    "nl",
+    "pl",
+    "ru",
+    "sv",
+    "tr",
+    "tl",
+    "bg",
+    "ro",
+    "ar",
+    "cs",
+    "el",
+    "fi",
+    "hr",
+    "ms",
+    "sk",
+    "da",
+    "ta",
+    "uk",
+    "hu",
+    "no",
+    "vi",
+    "bn",
+    "th",
+    "he",
+    "ka",
+    "id",
+    "te",
+    "gu",
+    "kn",
+    "ml",
+    "mr",
+    "or",
+    "pa",
+]
 TTSDefaultVoiceId = "f786b574-daa5-4673-aa0c-cbe3e8534c02"  # Katie - Friendly Fixer
 TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
 

diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py
@@ -70,6 +70,8 @@ class _TTSOptions:
     emotion: list[TTSVoiceEmotion | str] | None
     volume: float | None
     word_timestamps: bool
+    add_phoneme_timestamps: bool
+    use_normalized_timestamps: bool
     api_key: str
     language: LanguageCode | None
     base_url: str
@@ -97,6 +99,8 @@ def __init__(
         volume: float | None = None,
         sample_rate: int = 24000,
         word_timestamps: bool = True,
+        add_phoneme_timestamps: bool = False,
+        use_normalized_timestamps: bool = False,
         pronunciation_dict_id: str | None = None,
         http_session: aiohttp.ClientSession | None = None,
         tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
@@ -119,6 +123,8 @@ def __init__(
             volume (float, optional): Volume of the speech, with sonic-3, the value is valid between 0.5 and 2.0
             sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
             word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
+            add_phoneme_timestamps (bool, optional): Whether to request phoneme-level timestamps from Cartesia on streaming requests. They are currently received but not surfaced by the output emitter. Defaults to False.
+            use_normalized_timestamps (bool, optional): Whether to ask Cartesia for normalized timestamps; only sent on streaming requests. Defaults to False.
             pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations. Defaults to None.
             api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
             http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
@@ -157,6 +163,8 @@ def __init__(
             api_key=cartesia_api_key,
             base_url=base_url,
             word_timestamps=word_timestamps,
+            add_phoneme_timestamps=add_phoneme_timestamps,
+            use_normalized_timestamps=use_normalized_timestamps,
             api_version=api_version,
             pronunciation_dict_id=pronunciation_dict_id,
         )
@@ -243,6 +251,8 @@ def update_options(
         emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
         volume: NotGivenOr[float] = NOT_GIVEN,
         pronunciation_dict_id: NotGivenOr[str] = NOT_GIVEN,
+        add_phoneme_timestamps: NotGivenOr[bool] = NOT_GIVEN,
+        use_normalized_timestamps: NotGivenOr[bool] = NOT_GIVEN,
         api_version: NotGivenOr[str] = NOT_GIVEN,
     ) -> None:
         """
@@ -258,6 +268,8 @@ def update_options(
             speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
             emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
             pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations.
+            add_phoneme_timestamps (bool, optional): Whether to request phoneme-level timestamps on streaming requests. They are currently received but not surfaced by the output emitter.
+            use_normalized_timestamps (bool, optional): Whether to ask Cartesia for normalized timestamps; only sent on streaming requests.
         """
         if is_given(model):
             self._opts.model = model
@@ -274,6 +286,10 @@ def update_options(
             self._opts.volume = volume
         if is_given(pronunciation_dict_id):
             self._opts.pronunciation_dict_id = pronunciation_dict_id
+        if is_given(add_phoneme_timestamps):
+            self._opts.add_phoneme_timestamps = add_phoneme_timestamps
+        if is_given(use_normalized_timestamps):
+            self._opts.use_normalized_timestamps = use_normalized_timestamps
         if is_given(api_version):
             self._opts.api_version = api_version
 
@@ -503,6 +519,8 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse, cartesia_context_id: s
                         output_emitter.push_timed_transcript(
                             TimedString(text=word, start_time=start, end_time=end)
                         )
+                elif data.get("phoneme_timestamps"):
+                    pass  # phoneme_timestamps are received but not surfaced by the output emitter
                 elif data.get("type") == "error":
                     logger.error(
                         "Cartesia returned error. Include the cartesia_context_id to support@cartesia.ai for help debugging.",
@@ -594,5 +612,9 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any
 
     if streaming:
         options["add_timestamps"] = opts.word_timestamps
+        if opts.add_phoneme_timestamps:
+            options["add_phoneme_timestamps"] = True
+        if opts.use_normalized_timestamps:
+            options["use_normalized_timestamps"] = True
 
     return options