2525from .log import logger
2626
2727TTSModel = Literal [
28+ "speech-2.6-hd" ,
29+ "speech-2.6-turbo" ,
2830 "speech-2.5-hd-preview" ,
2931 "speech-2.5-turbo-preview" ,
3032 "speech-02-hd" ,
3436]
3537
3638# Minimax TTS Voice IDs
37- # Defines small part of supported voices using a Literal type for static analysis.
38- # See more voices in docs of Minimax
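39+ # Defines a commonly used subset of the supported voices as a Literal type for static analysis.
40+ # Other voice IDs can still be passed as plain strings; see the Minimax documentation for the full list.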
3941TTSVoice = Literal [
42+ # Social Media Voices
43+ "socialmedia_female_2_v1" ,
44+ "socialmedia_female_1_v1" ,
45+ # Voice Agent Series
4046 "voice_agent_Female_Phone_4" ,
4147 "voice_agent_Male_Phone_1" ,
48+ "voice_agent_Male_Phone_2" ,
49+ # English Voices - Female
4250 "English_StressedLady" ,
4351 "English_SentimentalLady" ,
44- "English_WiseScholar" ,
4552 "English_radiant_girl" ,
46- "moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b" ,
53+ # English Voices - Male
54+ "English_WiseScholar" ,
55+ "English_Persuasive_Man" ,
56+ "English_Explanatory_Man" ,
57+ "English_Insightful_Speaker" ,
58+ # Japanese Voices
4759 "japanese_male_social_media_1_v2" ,
4860 "japanese_female_social_media_1_v2" ,
61+ # French Voices
4962 "French_CasualMan" ,
5063 "French_Female Journalist" ,
64+ # Spanish Voices
5165 "Spanish_Narrator" ,
5266 "Spanish_WiseScholar" ,
5367 "Spanish_ThoughtfulMan" ,
68+ # Arabic Voices
5469 "Arabic_CalmWoman" ,
5570 "Arabic_FriendlyGuy" ,
71+ # Portuguese Voices
5672 "Portuguese_ThoughtfulLady" ,
73+ # German Voices
5774 "German_PlayfulMan" ,
5875 "German_SweetLady" ,
76+ # MOSS Audio Series
77+ "moss_audio_7c7e7ae2-7356-11f0-9540-7ef9b4b62566" ,
78+ "moss_audio_b118f320-78c0-11f0-bbeb-26e8167c4779" ,
79+ "moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b" ,
80+ "moss_audio_82ebf67c-78c8-11f0-8e8e-36b92fbb4f95" ,
5981]
6082
6183DEFAULT_MODEL = "speech-02-turbo"
62- DEFAULT_VOICE_ID = "English_radiant_girl "
84+ DEFAULT_VOICE_ID = "socialmedia_female_2_v1 "
6385
6486
65- TTSEmotion = Literal ["happy" , "sad" , "angry" , "fearful" , "disgusted" , "surprised" , "neutral" ]
87+ # Note: "fluent" emotion is only supported by speech-2.6-* models
88+ TTSEmotion = Literal [
89+ "happy" , "sad" , "angry" , "fearful" , "disgusted" , "surprised" , "neutral" , "fluent"
90+ ]
6691
6792
6893TTSAudioFormat = Literal ["pcm" , "mp3" , "flac" , "wav" ]
6994TTSSampleRate = Literal [8000 , 16000 , 22050 , 24000 , 32000 , 44100 ]
7095TTSBitRate = Literal [32000 , 64000 , 128000 , 256000 ] # only for mp3 format
7196
72- DEFAULT_BASE_URL = "https://api.minimax.io" # or "https://api.minimaxi.com"
97+ DEFAULT_BASE_URL = (
98+ "https://api-uw.minimax.io" # or "https://api.minimaxi.chat" or "https://api.minimax.io"
99+ )
73100
74101
75102@dataclass
@@ -84,7 +111,7 @@ class _TTSOptions:
84111 speed : float # [0.5, 2.0]
85112 vol : float # (0, 10]
86113 pitch : int # [-12, 12]
87- english_normalization : bool
114+ text_normalization : bool
88115 pronunciation_dict : dict [str , list [str ]] | None
89116 # voice_modify
90117 intensity : int | None
@@ -102,7 +129,7 @@ def __init__(
102129 speed : float = 1.0 ,
103130 vol : float = 1.0 ,
104131 pitch : int = 0 ,
105- english_normalization : bool = False ,
132+ text_normalization : bool = False ,
106133 audio_format : TTSAudioFormat = "mp3" ,
107134 pronunciation_dict : dict [str , list [str ]] | None = None ,
108135 intensity : int | None = None ,
@@ -119,12 +146,16 @@ def __init__(
119146
120147 Args:
121148 model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
149+ Available models: speech-2.6-hd, speech-2.6-turbo, speech-2.5-hd-preview,
150+ speech-2.5-turbo-preview, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo.
122151 voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
123- emotion (TTSEmotion | None, optional): Emotion control for speech synthesis. Defaults to None.
152+ emotion (TTSEmotion | None, optional): Emotion control for speech synthesis.
153+ Options: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent".
154+ Note: "fluent" emotion is only supported by speech-2.6-* models. Defaults to None.
124155 speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
125156 vol (float, optional): Speech volume, range is (0, 10].
126157 pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
127- english_normalization (bool, optional): Enable text normalization in English. Improves performance
158+ text_normalization (bool, optional): Enable text normalization (Chinese/English). Improves performance
128159 in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
129160 audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
130161 pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
@@ -161,6 +192,13 @@ def __init__(
161192 if timbre is not None and not (- 100 <= timbre <= 100 ):
162193 raise ValueError (f"timbre must be between -100 and 100, but got { timbre } " )
163194
195+ # Validate fluent emotion is only used with speech-2.6-* models
196+ if emotion == "fluent" and not model .startswith ("speech-2.6" ):
197+ raise ValueError (
198+ f'"fluent" emotion is only supported by speech-2.6-* models, '
199+ f'but got model "{ model } ". Please use speech-2.6-hd or speech-2.6-turbo.'
200+ )
201+
164202 self ._sentence_tokenizer = (
165203 tokenizer if utils .is_given (tokenizer ) else tokenize .basic .SentenceTokenizer ()
166204 )
@@ -182,7 +220,7 @@ def __init__(
182220 speed = speed ,
183221 pitch = pitch ,
184222 vol = vol ,
185- english_normalization = english_normalization ,
223+ text_normalization = text_normalization ,
186224 timbre = timbre ,
187225 pronunciation_dict = pronunciation_dict ,
188226 intensity = intensity ,
@@ -209,7 +247,7 @@ def update_options(
209247 speed : NotGivenOr [float ] = NOT_GIVEN ,
210248 vol : NotGivenOr [float ] = NOT_GIVEN ,
211249 pitch : NotGivenOr [int ] = NOT_GIVEN ,
212- english_normalization : NotGivenOr [bool ] = NOT_GIVEN ,
250+ text_normalization : NotGivenOr [bool ] = NOT_GIVEN ,
213251 audio_format : NotGivenOr [TTSAudioFormat ] = NOT_GIVEN ,
214252 pronunciation_dict : NotGivenOr [dict [str , list [str ]]] = NOT_GIVEN ,
215253 intensity : NotGivenOr [int ] = NOT_GIVEN ,
@@ -234,8 +272,8 @@ def update_options(
234272 if utils .is_given (pitch ):
235273 self ._opts .pitch = pitch
236274
237- if utils .is_given (english_normalization ):
238- self ._opts .english_normalization = english_normalization
275+ if utils .is_given (text_normalization ):
276+ self ._opts .text_normalization = text_normalization
239277
240278 if utils .is_given (audio_format ):
241279 self ._opts .audio_format = cast (TTSAudioFormat , audio_format )
@@ -264,6 +302,10 @@ async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
264302 headers = {"Authorization" : f"Bearer { self ._opts .api_key } " }
265303 session = self ._ensure_session ()
266304 ws = await asyncio .wait_for (session .ws_connect (url , headers = headers ), timeout )
305+
306+ # Log WebSocket connection establishment
307+ logger .debug (f"MiniMax WebSocket connected to { url } " )
308+
267309 return ws
268310
269311 async def _close_ws (self , ws : aiohttp .ClientWebSocketResponse ) -> None :
@@ -295,7 +337,10 @@ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
295337 self ._opts = replace (tts ._opts )
296338
297339 async def _run (self , output_emitter : tts .AudioEmitter ) -> None :
340+ # Initialize with temporary ID, will be updated from WebSocket messages
298341 request_id = utils .shortuuid ()
342+ trace_id = request_id # Use trace_id directly instead of creating a dict
343+
299344 output_emitter .initialize (
300345 request_id = request_id ,
301346 sample_rate = self ._opts .sample_rate ,
@@ -339,36 +384,61 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
339384 await ws .send_str (json .dumps ({"event" : "task_finish" }))
340385
341386 async def _recv_task (ws : aiohttp .ClientWebSocketResponse ) -> None :
387+ # Initialize trace_id to ensure it's available in all code paths
388+ current_trace_id = trace_id
389+
342390 while True :
343391 msg = await ws .receive ()
344392 if msg .type in (
345393 aiohttp .WSMsgType .CLOSED ,
346394 aiohttp .WSMsgType .CLOSE ,
347395 aiohttp .WSMsgType .CLOSING ,
348396 ):
349- raise APIStatusError (
350- "Minimax connection closed unexpectedly" , request_id = request_id
397+ error_msg = (
398+ f"MiniMax connection closed unexpectedly (trace_id: { current_trace_id } )"
351399 )
400+ logger .error (error_msg )
401+ raise APIStatusError (error_msg , request_id = current_trace_id )
352402
353403 if msg .type != aiohttp .WSMsgType .TEXT :
354404 logger .warning ("unexpected Minimax message type %s" , msg .type )
355405 continue
356406
357407 data : dict [str , Any ] = json .loads (msg .data )
358- status_code = data .get ("base_resp" , {}).get ("status_code" )
408+
409+ # Extract trace_id (priority: root.trace_id > base_resp.trace_id)
410+ # api.minimax.io returns trace_id in root.trace_id, api.minimaxi.com may return in base_resp.trace_id
411+ msg_trace_id = data .get ("trace_id" ) or data .get ("base_resp" , {}).get ("trace_id" )
412+ if msg_trace_id and msg_trace_id != current_trace_id :
413+ current_trace_id = msg_trace_id
414+ logger .debug (f"MiniMax WebSocket trace_id updated: { msg_trace_id } " )
415+
416+ base_resp = data .get ("base_resp" , {})
417+ status_code = base_resp .get ("status_code" , 0 )
359418 if status_code != 0 :
419+ status_msg = base_resp .get ("status_msg" , "Unknown error" )
420+ error_trace_id = msg_trace_id or current_trace_id
421+
422+ logger .error (
423+ f"MiniMax WebSocket error: code={ status_code } , msg={ status_msg } , trace_id={ error_trace_id } " ,
424+ extra = {"request_id" : request_id , "full_response" : data },
425+ )
426+
360427 raise APIStatusError (
361- f"Minimax returned non-zero status code : { status_code } " ,
362- request_id = request_id ,
428+ f"MiniMax error [ { status_code } ]: { status_msg } (trace_id : { error_trace_id } ) " ,
429+ request_id = error_trace_id ,
363430 body = data ,
364431 )
365432
366433 if data .get ("event" ) == "connected_success" :
367- pass
434+ logger . debug ( f"MiniMax WebSocket connected, trace_id= { current_trace_id } " )
368435
369436 elif data .get ("event" ) == "task_started" :
370437 task_started .set_result (None )
371438 session_id = data .get ("session_id" , "" )
439+ logger .debug (
440+ f"MiniMax WebSocket task_started, session_id={ session_id } , trace_id={ current_trace_id } "
441+ )
372442 output_emitter .start_segment (segment_id = session_id )
373443
374444 elif data .get ("event" ) == "task_continued" :
@@ -383,7 +453,11 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
383453 break
384454
385455 elif data .get ("event" ) == "task_failed" :
386- raise APIError (f"Minimax returned task failed: { msg .data } " )
456+ error_msg = (
457+ f"MiniMax returned task failed (trace_id: { current_trace_id } ): { msg .data } "
458+ )
459+ logger .error (error_msg )
460+ raise APIError (error_msg )
387461
388462 else :
389463 logger .warning (f"unexpected Minimax message: { msg .data } " )
@@ -404,13 +478,32 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
404478 await utils .aio .gracefully_cancel (* tasks )
405479
406480 except asyncio .TimeoutError :
407- raise APITimeoutError () from None
481+ logger .error (
482+ f"MiniMax WebSocket request timeout after { self ._conn_options .timeout } s, trace_id={ trace_id } "
483+ )
484+ raise APITimeoutError (
485+ f"WebSocket TTS synthesis timed out after { self ._conn_options .timeout } s (trace_id: { trace_id } )"
486+ ) from None
408487 except aiohttp .ClientResponseError as e :
488+ logger .error (
489+ f"WebSocket HTTP error: status={ e .status } , message={ e .message } , trace_id={ trace_id } " ,
490+ exc_info = True ,
491+ )
409492 raise APIStatusError (
410- message = e .message , status_code = e .status , request_id = None , body = None
493+ message = f"WebSocket HTTP { e .status } : { e .message } (trace_id: { trace_id } )" ,
494+ status_code = e .status ,
495+ request_id = trace_id ,
496+ body = None ,
411497 ) from e
412498 except Exception as e :
413- raise APIConnectionError () from e
499+ if not isinstance (e , (APIStatusError , APITimeoutError , APIConnectionError )):
500+ logger .error (
501+ f"MiniMax WebSocket unexpected error: { type (e ).__name__ } : { e } , trace_id={ trace_id } " ,
502+ exc_info = True ,
503+ )
504+ raise APIConnectionError (
505+ f"WebSocket connection failed: { type (e ).__name__ } : { e } (trace_id: { trace_id } )"
506+ ) from e
414507
415508 async def aclose (self ) -> None :
416509 await super ().aclose ()
@@ -448,8 +541,23 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
448541 ) as resp :
449542 resp .raise_for_status ()
450543
544+ # Extract trace_id from response headers (all requests have this)
545+ # Note: api.minimax.io also returns trace_id in response body root.trace_id
546+ trace_id = resp .headers .get ("Trace-Id" ) or resp .headers .get ("X-Trace-Id" )
547+ minimax_request_id = resp .headers .get ("Minimax-Request-Id" )
548+
549+ if trace_id :
550+ logger .debug (
551+ f"MiniMax HTTP stream request started, trace_id={ trace_id } , minimax_request_id={ minimax_request_id } "
552+ )
553+ else :
554+ trace_id = utils .shortuuid ()
555+ logger .warning (
556+ f"No Trace-Id in response headers, using generated ID: { trace_id } "
557+ )
558+
451559 output_emitter .initialize (
452- request_id = utils . shortuuid () ,
560+ request_id = trace_id ,
453561 sample_rate = self ._opts .sample_rate ,
454562 num_channels = 1 ,
455563 mime_type = f"audio/{ self ._opts .audio_format } " ,
@@ -464,24 +572,53 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
464572 continue
465573
466574 data = json .loads (line [5 :])
575+
576+ # api.minimax.io returns trace_id in response body root level
577+ body_trace_id = data .get ("trace_id" )
578+ if body_trace_id and body_trace_id != trace_id :
579+ logger .debug (f"Found trace_id in response body: { body_trace_id } " )
580+
467581 if audio := data .get ("data" , {}).get ("audio" ):
468582 output_emitter .push (bytes .fromhex (audio ))
469- elif (status_code := data .get ("base_resp" , {}).get ("status_code" )) != 0 :
470- raise APIStatusError (
471- f"Minimax returned non-zero status code: { status_code } " ,
472- request_id = None ,
473- body = data ,
474- )
583+ else :
584+ base_resp = data .get ("base_resp" , {})
585+ status_code = base_resp .get ("status_code" , 0 )
586+ if status_code != 0 :
587+ status_msg = base_resp .get ("status_msg" , "Unknown error" )
588+ # trace_id priority: response body top level > response headers
589+ error_trace_id = body_trace_id or trace_id
590+
591+ logger .error (
592+ f"MiniMax HTTP stream error: code={ status_code } , msg={ status_msg } , trace_id={ error_trace_id } " ,
593+ extra = {"full_response" : data },
594+ )
595+
596+ raise APIStatusError (
597+ f"MiniMax error [{ status_code } ]: { status_msg } (trace_id: { error_trace_id } )" ,
598+ request_id = error_trace_id ,
599+ body = data ,
600+ )
475601 output_emitter .flush ()
476602
477603 except asyncio .TimeoutError :
478- raise APITimeoutError () from None
604+ logger .error (f"Minimax HTTP stream request timeout after { self ._conn_options .timeout } s" )
605+ raise APITimeoutError (
606+ f"TTS synthesis timed out after { self ._conn_options .timeout } s"
607+ ) from None
479608 except aiohttp .ClientResponseError as e :
609+ logger .error (f"HTTP error: status={ e .status } , message={ e .message } " , exc_info = True )
480610 raise APIStatusError (
481- message = e .message , status_code = e .status , request_id = None , body = None
482- ) from None
611+ message = f"HTTP { e .status } : { e .message } " ,
612+ status_code = e .status ,
613+ request_id = None ,
614+ body = None ,
615+ ) from e
483616 except Exception as e :
484- raise APIConnectionError () from e
617+ if not isinstance (e , (APIStatusError , APITimeoutError , APIConnectionError )):
618+ logger .error (
619+ f"Minimax TTS unexpected error: { type (e ).__name__ } : { e } " , exc_info = True
620+ )
621+ raise APIConnectionError (f"Connection failed: { type (e ).__name__ } : { e } " ) from e
485622
486623
487624def _to_minimax_options (opts : _TTSOptions ) -> dict [str , Any ]:
@@ -499,6 +636,7 @@ def _to_minimax_options(opts: _TTSOptions) -> dict[str, Any]:
499636 "format" : opts .audio_format ,
500637 "channel" : 1 ,
501638 },
639+ "text_normalization" : opts .text_normalization ,
502640 }
503641
504642 if opts .emotion is not None :
0 commit comments