feat(minimax): comprehensive TTS updates and parameter rename (#3788)

zhenyujia23-crypto · minimax · minimax · web-flow · commit a42275669e1c · 2025-11-12T14:05:27.000+08:00
Co-authored-by: minimax &lt;minimax@192.168.1.12&gt;
Co-authored-by: minimax &lt;minimax@minimaxdeMacBook-Air-4.local&gt;
Co-authored-by: Long Chen &lt;longch1024@gmail.com&gt;
diff --git a/livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py b/livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py
@@ -25,6 +25,8 @@
 from .log import logger
 
 TTSModel = Literal[
+    "speech-2.6-hd",
+    "speech-2.6-turbo",
     "speech-2.5-hd-preview",
     "speech-2.5-turbo-preview",
     "speech-02-hd",
@@ -34,42 +36,67 @@
 ]
 
 # Minimax TTS Voice IDs
-# Defines small part of supported voices using a Literal type for static analysis.
-# See more voices in docs of Minimax
+# Defines commonly used voices for static analysis.
+# See full voice list in Minimax documentation
 TTSVoice = Literal[
+    # Social Media Voices
+    "socialmedia_female_2_v1",
+    "socialmedia_female_1_v1",
+    # Voice Agent Series
     "voice_agent_Female_Phone_4",
     "voice_agent_Male_Phone_1",
+    "voice_agent_Male_Phone_2",
+    # English Voices - Female
     "English_StressedLady",
     "English_SentimentalLady",
-    "English_WiseScholar",
     "English_radiant_girl",
-    "moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b",
+    # English Voices - Male
+    "English_WiseScholar",
+    "English_Persuasive_Man",
+    "English_Explanatory_Man",
+    "English_Insightful_Speaker",
+    # Japanese Voices
     "japanese_male_social_media_1_v2",
     "japanese_female_social_media_1_v2",
+    # French Voices
     "French_CasualMan",
     "French_Female Journalist",
+    # Spanish Voices
     "Spanish_Narrator",
     "Spanish_WiseScholar",
     "Spanish_ThoughtfulMan",
+    # Arabic Voices
     "Arabic_CalmWoman",
     "Arabic_FriendlyGuy",
+    # Portuguese Voices
     "Portuguese_ThoughtfulLady",
+    # German Voices
     "German_PlayfulMan",
     "German_SweetLady",
+    # MOSS Audio Series
+    "moss_audio_7c7e7ae2-7356-11f0-9540-7ef9b4b62566",
+    "moss_audio_b118f320-78c0-11f0-bbeb-26e8167c4779",
+    "moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b",
+    "moss_audio_82ebf67c-78c8-11f0-8e8e-36b92fbb4f95",
 ]
 
 DEFAULT_MODEL = "speech-02-turbo"
-DEFAULT_VOICE_ID = "English_radiant_girl"
+DEFAULT_VOICE_ID = "socialmedia_female_2_v1"
 
 
-TTSEmotion = Literal["happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"]
+# Note: "fluent" emotion is only supported by speech-2.6-* models
+TTSEmotion = Literal[
+    "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent"
+]
 
 
 TTSAudioFormat = Literal["pcm", "mp3", "flac", "wav"]
 TTSSampleRate = Literal[8000, 16000, 22050, 24000, 32000, 44100]
 TTSBitRate = Literal[32000, 64000, 128000, 256000]  # only for mp3 format
 
-DEFAULT_BASE_URL = "https://api.minimax.io"  # or "https://api.minimaxi.com"
+DEFAULT_BASE_URL = (
+    "https://api-uw.minimax.io"  # or "https://api.minimaxi.chat or https://api.minimax.io"
+)
 
 
 @dataclass
@@ -84,7 +111,7 @@ class _TTSOptions:
     speed: float  # [0.5, 2.0]
     vol: float  # (0, 10]
     pitch: int  # [-12, 12]
-    english_normalization: bool
+    text_normalization: bool
     pronunciation_dict: dict[str, list[str]] | None
     # voice_modify
     intensity: int | None
@@ -102,7 +129,7 @@ def __init__(
         speed: float = 1.0,
         vol: float = 1.0,
         pitch: int = 0,
-        english_normalization: bool = False,
+        text_normalization: bool = False,
         audio_format: TTSAudioFormat = "mp3",
         pronunciation_dict: dict[str, list[str]] | None = None,
         intensity: int | None = None,
@@ -119,12 +146,16 @@ def __init__(
 
         Args:
             model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
+                Available models: speech-2.6-hd, speech-2.6-turbo, speech-2.5-hd-preview,
+                speech-2.5-turbo-preview, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo.
             voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
-            emotion (TTSEmotion | None, optional): Emotion control for speech synthesis. Defaults to None.
+            emotion (TTSEmotion | None, optional): Emotion control for speech synthesis.
+                Options: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent".
+                Note: "fluent" emotion is only supported by speech-2.6-* models. Defaults to None.
             speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
             vol (float, optional): Speech volume, range is [0, 10].
             pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
-            english_normalization (bool, optional): Enable text normalization in English. Improves performance
+            text_normalization (bool, optional): Enable text normalization (Chinese/English). Improves performance
                 in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
             audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
             pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
@@ -161,6 +192,13 @@ def __init__(
         if timbre is not None and not (-100 <= timbre <= 100):
             raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")
 
+        # Validate fluent emotion is only used with speech-2.6-* models
+        if emotion == "fluent" and not model.startswith("speech-2.6"):
+            raise ValueError(
+                f'"fluent" emotion is only supported by speech-2.6-* models, '
+                f'but got model "{model}". Please use speech-2.6-hd or speech-2.6-turbo.'
+            )
+
         self._sentence_tokenizer = (
             tokenizer if utils.is_given(tokenizer) else tokenize.basic.SentenceTokenizer()
         )
@@ -182,7 +220,7 @@ def __init__(
             speed=speed,
             pitch=pitch,
             vol=vol,
-            english_normalization=english_normalization,
+            text_normalization=text_normalization,
             timbre=timbre,
             pronunciation_dict=pronunciation_dict,
             intensity=intensity,
@@ -209,7 +247,7 @@ def update_options(
         speed: NotGivenOr[float] = NOT_GIVEN,
         vol: NotGivenOr[float] = NOT_GIVEN,
         pitch: NotGivenOr[int] = NOT_GIVEN,
-        english_normalization: NotGivenOr[bool] = NOT_GIVEN,
+        text_normalization: NotGivenOr[bool] = NOT_GIVEN,
         audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
         pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
         intensity: NotGivenOr[int] = NOT_GIVEN,
@@ -234,8 +272,8 @@ def update_options(
         if utils.is_given(pitch):
             self._opts.pitch = pitch
 
-        if utils.is_given(english_normalization):
-            self._opts.english_normalization = english_normalization
+        if utils.is_given(text_normalization):
+            self._opts.text_normalization = text_normalization
 
         if utils.is_given(audio_format):
             self._opts.audio_format = cast(TTSAudioFormat, audio_format)
@@ -264,6 +302,10 @@ async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
         headers = {"Authorization": f"Bearer {self._opts.api_key}"}
         session = self._ensure_session()
         ws = await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)
+
+        # Log WebSocket connection establishment
+        logger.debug(f"MiniMax WebSocket connected to {url}")
+
         return ws
 
     async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
@@ -295,7 +337,10 @@ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
         self._opts = replace(tts._opts)
 
     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+        # Initialize with temporary ID, will be updated from WebSocket messages
         request_id = utils.shortuuid()
+        trace_id = request_id  # Use trace_id directly instead of creating a dict
+
         output_emitter.initialize(
             request_id=request_id,
             sample_rate=self._opts.sample_rate,
@@ -339,36 +384,61 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
             await ws.send_str(json.dumps({"event": "task_finish"}))
 
         async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+            # Initialize trace_id to ensure it's available in all code paths
+            current_trace_id = trace_id
+
             while True:
                 msg = await ws.receive()
                 if msg.type in (
                     aiohttp.WSMsgType.CLOSED,
                     aiohttp.WSMsgType.CLOSE,
                     aiohttp.WSMsgType.CLOSING,
                 ):
-                    raise APIStatusError(
-                        "Minimax connection closed unexpectedly", request_id=request_id
+                    error_msg = (
+                        f"MiniMax connection closed unexpectedly (trace_id: {current_trace_id})"
                     )
+                    logger.error(error_msg)
+                    raise APIStatusError(error_msg, request_id=current_trace_id)
 
                 if msg.type != aiohttp.WSMsgType.TEXT:
                     logger.warning("unexpected Minimax message type %s", msg.type)
                     continue
 
                 data: dict[str, Any] = json.loads(msg.data)
-                status_code = data.get("base_resp", {}).get("status_code")
+
+                # Extract trace_id (priority: root.trace_id > base_resp.trace_id)
+                # api.minimax.io returns trace_id in root.trace_id, api.minimaxi.com may return in base_resp.trace_id
+                msg_trace_id = data.get("trace_id") or data.get("base_resp", {}).get("trace_id")
+                if msg_trace_id and msg_trace_id != current_trace_id:
+                    current_trace_id = msg_trace_id
+                    logger.debug(f"MiniMax WebSocket trace_id updated: {msg_trace_id}")
+
+                base_resp = data.get("base_resp", {})
+                status_code = base_resp.get("status_code", 0)
                 if status_code != 0:
+                    status_msg = base_resp.get("status_msg", "Unknown error")
+                    error_trace_id = msg_trace_id or current_trace_id
+
+                    logger.error(
+                        f"MiniMax WebSocket error: code={status_code}, msg={status_msg}, trace_id={error_trace_id}",
+                        extra={"request_id": request_id, "full_response": data},
+                    )
+
                     raise APIStatusError(
-                        f"Minimax returned non-zero status code: {status_code}",
-                        request_id=request_id,
+                        f"MiniMax error [{status_code}]: {status_msg} (trace_id: {error_trace_id})",
+                        request_id=error_trace_id,
                         body=data,
                     )
 
                 if data.get("event") == "connected_success":
-                    pass
+                    logger.debug(f"MiniMax WebSocket connected, trace_id={current_trace_id}")
 
                 elif data.get("event") == "task_started":
                     task_started.set_result(None)
                     session_id = data.get("session_id", "")
+                    logger.debug(
+                        f"MiniMax WebSocket task_started, session_id={session_id}, trace_id={current_trace_id}"
+                    )
                     output_emitter.start_segment(segment_id=session_id)
 
                 elif data.get("event") == "task_continued":
@@ -383,7 +453,11 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                     break
 
                 elif data.get("event") == "task_failed":
-                    raise APIError(f"Minimax returned task failed: {msg.data}")
+                    error_msg = (
+                        f"MiniMax returned task failed (trace_id: {current_trace_id}): {msg.data}"
+                    )
+                    logger.error(error_msg)
+                    raise APIError(error_msg)
 
                 else:
                     logger.warning(f"unexpected Minimax message: {msg.data}")
@@ -404,13 +478,32 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                 await utils.aio.gracefully_cancel(*tasks)
 
         except asyncio.TimeoutError:
-            raise APITimeoutError() from None
+            logger.error(
+                f"MiniMax WebSocket request timeout after {self._conn_options.timeout}s, trace_id={trace_id}"
+            )
+            raise APITimeoutError(
+                f"WebSocket TTS synthesis timed out after {self._conn_options.timeout}s (trace_id: {trace_id})"
+            ) from None
         except aiohttp.ClientResponseError as e:
+            logger.error(
+                f"WebSocket HTTP error: status={e.status}, message={e.message}, trace_id={trace_id}",
+                exc_info=True,
+            )
             raise APIStatusError(
-                message=e.message, status_code=e.status, request_id=None, body=None
+                message=f"WebSocket HTTP {e.status}: {e.message} (trace_id: {trace_id})",
+                status_code=e.status,
+                request_id=trace_id,
+                body=None,
             ) from e
         except Exception as e:
-            raise APIConnectionError() from e
+            if not isinstance(e, (APIStatusError, APITimeoutError, APIConnectionError)):
+                logger.error(
+                    f"MiniMax WebSocket unexpected error: {type(e).__name__}: {e}, trace_id={trace_id}",
+                    exc_info=True,
+                )
+            raise APIConnectionError(
+                f"WebSocket connection failed: {type(e).__name__}: {e} (trace_id: {trace_id})"
+            ) from e
 
     async def aclose(self) -> None:
         await super().aclose()
@@ -448,8 +541,23 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
             ) as resp:
                 resp.raise_for_status()
 
+                # Extract trace_id from response headers (all requests have this)
+                # Note: api.minimax.io also returns trace_id in response body root.trace_id
+                trace_id = resp.headers.get("Trace-Id") or resp.headers.get("X-Trace-Id")
+                minimax_request_id = resp.headers.get("Minimax-Request-Id")
+
+                if trace_id:
+                    logger.debug(
+                        f"MiniMax HTTP stream request started, trace_id={trace_id}, minimax_request_id={minimax_request_id}"
+                    )
+                else:
+                    trace_id = utils.shortuuid()
+                    logger.warning(
+                        f"No Trace-Id in response headers, using generated ID: {trace_id}"
+                    )
+
                 output_emitter.initialize(
-                    request_id=utils.shortuuid(),
+                    request_id=trace_id,
                     sample_rate=self._opts.sample_rate,
                     num_channels=1,
                     mime_type=f"audio/{self._opts.audio_format}",
@@ -464,24 +572,53 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
                         continue
 
                     data = json.loads(line[5:])
+
+                    # api.minimax.io returns trace_id in response body root level
+                    body_trace_id = data.get("trace_id")
+                    if body_trace_id and body_trace_id != trace_id:
+                        logger.debug(f"Found trace_id in response body: {body_trace_id}")
+
                     if audio := data.get("data", {}).get("audio"):
                         output_emitter.push(bytes.fromhex(audio))
-                    elif (status_code := data.get("base_resp", {}).get("status_code")) != 0:
-                        raise APIStatusError(
-                            f"Minimax returned non-zero status code: {status_code}",
-                            request_id=None,
-                            body=data,
-                        )
+                    else:
+                        base_resp = data.get("base_resp", {})
+                        status_code = base_resp.get("status_code", 0)
+                        if status_code != 0:
+                            status_msg = base_resp.get("status_msg", "Unknown error")
+                            # trace_id priority: response body top level > response headers
+                            error_trace_id = body_trace_id or trace_id
+
+                            logger.error(
+                                f"MiniMax HTTP stream error: code={status_code}, msg={status_msg}, trace_id={error_trace_id}",
+                                extra={"full_response": data},
+                            )
+
+                            raise APIStatusError(
+                                f"MiniMax error [{status_code}]: {status_msg} (trace_id: {error_trace_id})",
+                                request_id=error_trace_id,
+                                body=data,
+                            )
                 output_emitter.flush()
 
         except asyncio.TimeoutError:
-            raise APITimeoutError() from None
+            logger.error(f"Minimax HTTP stream request timeout after {self._conn_options.timeout}s")
+            raise APITimeoutError(
+                f"TTS synthesis timed out after {self._conn_options.timeout}s"
+            ) from None
         except aiohttp.ClientResponseError as e:
+            logger.error(f"HTTP error: status={e.status}, message={e.message}", exc_info=True)
             raise APIStatusError(
-                message=e.message, status_code=e.status, request_id=None, body=None
-            ) from None
+                message=f"HTTP {e.status}: {e.message}",
+                status_code=e.status,
+                request_id=None,
+                body=None,
+            ) from e
         except Exception as e:
-            raise APIConnectionError() from e
+            if not isinstance(e, (APIStatusError, APITimeoutError, APIConnectionError)):
+                logger.error(
+                    f"Minimax TTS unexpected error: {type(e).__name__}: {e}", exc_info=True
+                )
+            raise APIConnectionError(f"Connection failed: {type(e).__name__}: {e}") from e
 
 
 def _to_minimax_options(opts: _TTSOptions) -> dict[str, Any]:
@@ -499,6 +636,7 @@ def _to_minimax_options(opts: _TTSOptions) -> dict[str, Any]:
             "format": opts.audio_format,
             "channel": 1,
         },
+        "text_normalization": opts.text_normalization,
     }
 
     if opts.emotion is not None: