Skip to content

Commit a422756

Browse files
zhenyujia23-cryptominimaxminimaxlongcw
authored
feat(minimax): comprehensive TTS updates and parameter rename (#3788)
Co-authored-by: minimax <[email protected]> Co-authored-by: minimax <[email protected]> Co-authored-by: Long Chen <[email protected]>
1 parent a18d5e9 commit a422756

File tree

1 file changed

+174
-36
lines changed
  • livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax

1 file changed

+174
-36
lines changed

livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py

Lines changed: 174 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from .log import logger
2626

2727
TTSModel = Literal[
28+
"speech-2.6-hd",
29+
"speech-2.6-turbo",
2830
"speech-2.5-hd-preview",
2931
"speech-2.5-turbo-preview",
3032
"speech-02-hd",
@@ -34,42 +36,67 @@
3436
]
3537

3638
# Minimax TTS Voice IDs
37-
# Defines small part of supported voices using a Literal type for static analysis.
38-
# See more voices in docs of Minimax
39+
# Defines commonly used voices for static analysis.
40+
# See full voice list in Minimax documentation
3941
TTSVoice = Literal[
42+
# Social Media Voices
43+
"socialmedia_female_2_v1",
44+
"socialmedia_female_1_v1",
45+
# Voice Agent Series
4046
"voice_agent_Female_Phone_4",
4147
"voice_agent_Male_Phone_1",
48+
"voice_agent_Male_Phone_2",
49+
# English Voices - Female
4250
"English_StressedLady",
4351
"English_SentimentalLady",
44-
"English_WiseScholar",
4552
"English_radiant_girl",
46-
"moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b",
53+
# English Voices - Male
54+
"English_WiseScholar",
55+
"English_Persuasive_Man",
56+
"English_Explanatory_Man",
57+
"English_Insightful_Speaker",
58+
# Japanese Voices
4759
"japanese_male_social_media_1_v2",
4860
"japanese_female_social_media_1_v2",
61+
# French Voices
4962
"French_CasualMan",
5063
"French_Female Journalist",
64+
# Spanish Voices
5165
"Spanish_Narrator",
5266
"Spanish_WiseScholar",
5367
"Spanish_ThoughtfulMan",
68+
# Arabic Voices
5469
"Arabic_CalmWoman",
5570
"Arabic_FriendlyGuy",
71+
# Portuguese Voices
5672
"Portuguese_ThoughtfulLady",
73+
# German Voices
5774
"German_PlayfulMan",
5875
"German_SweetLady",
76+
# MOSS Audio Series
77+
"moss_audio_7c7e7ae2-7356-11f0-9540-7ef9b4b62566",
78+
"moss_audio_b118f320-78c0-11f0-bbeb-26e8167c4779",
79+
"moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b",
80+
"moss_audio_82ebf67c-78c8-11f0-8e8e-36b92fbb4f95",
5981
]
6082

6183
DEFAULT_MODEL = "speech-02-turbo"
62-
DEFAULT_VOICE_ID = "English_radiant_girl"
84+
DEFAULT_VOICE_ID = "socialmedia_female_2_v1"
6385

6486

65-
TTSEmotion = Literal["happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"]
87+
# Note: "fluent" emotion is only supported by speech-2.6-* models
88+
TTSEmotion = Literal[
89+
"happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent"
90+
]
6691

6792

6893
TTSAudioFormat = Literal["pcm", "mp3", "flac", "wav"]
6994
TTSSampleRate = Literal[8000, 16000, 22050, 24000, 32000, 44100]
7095
TTSBitRate = Literal[32000, 64000, 128000, 256000] # only for mp3 format
7196

72-
DEFAULT_BASE_URL = "https://api.minimax.io" # or "https://api.minimaxi.com"
97+
DEFAULT_BASE_URL = (
98+
"https://api-uw.minimax.io" # or "https://api.minimaxi.chat" or "https://api.minimax.io"
99+
)
73100

74101

75102
@dataclass
@@ -84,7 +111,7 @@ class _TTSOptions:
84111
speed: float # [0.5, 2.0]
85112
vol: float # (0, 10]
86113
pitch: int # [-12, 12]
87-
english_normalization: bool
114+
text_normalization: bool
88115
pronunciation_dict: dict[str, list[str]] | None
89116
# voice_modify
90117
intensity: int | None
@@ -102,7 +129,7 @@ def __init__(
102129
speed: float = 1.0,
103130
vol: float = 1.0,
104131
pitch: int = 0,
105-
english_normalization: bool = False,
132+
text_normalization: bool = False,
106133
audio_format: TTSAudioFormat = "mp3",
107134
pronunciation_dict: dict[str, list[str]] | None = None,
108135
intensity: int | None = None,
@@ -119,12 +146,16 @@ def __init__(
119146
120147
Args:
121148
model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
149+
Available models: speech-2.6-hd, speech-2.6-turbo, speech-2.5-hd-preview,
150+
speech-2.5-turbo-preview, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo.
122151
voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
123-
emotion (TTSEmotion | None, optional): Emotion control for speech synthesis. Defaults to None.
152+
emotion (TTSEmotion | None, optional): Emotion control for speech synthesis.
153+
Options: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent".
154+
Note: "fluent" emotion is only supported by speech-2.6-* models. Defaults to None.
124155
speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
125156
vol (float, optional): Speech volume, range is (0, 10].
126157
pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
127-
english_normalization (bool, optional): Enable text normalization in English. Improves performance
158+
text_normalization (bool, optional): Enable text normalization (Chinese/English). Improves performance
128159
in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
129160
audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
130161
pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
@@ -161,6 +192,13 @@ def __init__(
161192
if timbre is not None and not (-100 <= timbre <= 100):
162193
raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")
163194

195+
# Validate fluent emotion is only used with speech-2.6-* models
196+
if emotion == "fluent" and not model.startswith("speech-2.6"):
197+
raise ValueError(
198+
f'"fluent" emotion is only supported by speech-2.6-* models, '
199+
f'but got model "{model}". Please use speech-2.6-hd or speech-2.6-turbo.'
200+
)
201+
164202
self._sentence_tokenizer = (
165203
tokenizer if utils.is_given(tokenizer) else tokenize.basic.SentenceTokenizer()
166204
)
@@ -182,7 +220,7 @@ def __init__(
182220
speed=speed,
183221
pitch=pitch,
184222
vol=vol,
185-
english_normalization=english_normalization,
223+
text_normalization=text_normalization,
186224
timbre=timbre,
187225
pronunciation_dict=pronunciation_dict,
188226
intensity=intensity,
@@ -209,7 +247,7 @@ def update_options(
209247
speed: NotGivenOr[float] = NOT_GIVEN,
210248
vol: NotGivenOr[float] = NOT_GIVEN,
211249
pitch: NotGivenOr[int] = NOT_GIVEN,
212-
english_normalization: NotGivenOr[bool] = NOT_GIVEN,
250+
text_normalization: NotGivenOr[bool] = NOT_GIVEN,
213251
audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
214252
pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
215253
intensity: NotGivenOr[int] = NOT_GIVEN,
@@ -234,8 +272,8 @@ def update_options(
234272
if utils.is_given(pitch):
235273
self._opts.pitch = pitch
236274

237-
if utils.is_given(english_normalization):
238-
self._opts.english_normalization = english_normalization
275+
if utils.is_given(text_normalization):
276+
self._opts.text_normalization = text_normalization
239277

240278
if utils.is_given(audio_format):
241279
self._opts.audio_format = cast(TTSAudioFormat, audio_format)
@@ -264,6 +302,10 @@ async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
264302
headers = {"Authorization": f"Bearer {self._opts.api_key}"}
265303
session = self._ensure_session()
266304
ws = await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)
305+
306+
# Log WebSocket connection establishment
307+
logger.debug(f"MiniMax WebSocket connected to {url}")
308+
267309
return ws
268310

269311
async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
@@ -295,7 +337,10 @@ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
295337
self._opts = replace(tts._opts)
296338

297339
async def _run(self, output_emitter: tts.AudioEmitter) -> None:
340+
# Locally generated ID; it also seeds the trace_id used until the server reports its own in a WebSocket message
298341
request_id = utils.shortuuid()
342+
trace_id = request_id # Fallback trace_id for the outer error handlers; _recv_task tracks the server-supplied value in its own local
343+
299344
output_emitter.initialize(
300345
request_id=request_id,
301346
sample_rate=self._opts.sample_rate,
@@ -339,36 +384,61 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
339384
await ws.send_str(json.dumps({"event": "task_finish"}))
340385

341386
async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
387+
# Initialize trace_id to ensure it's available in all code paths
388+
current_trace_id = trace_id
389+
342390
while True:
343391
msg = await ws.receive()
344392
if msg.type in (
345393
aiohttp.WSMsgType.CLOSED,
346394
aiohttp.WSMsgType.CLOSE,
347395
aiohttp.WSMsgType.CLOSING,
348396
):
349-
raise APIStatusError(
350-
"Minimax connection closed unexpectedly", request_id=request_id
397+
error_msg = (
398+
f"MiniMax connection closed unexpectedly (trace_id: {current_trace_id})"
351399
)
400+
logger.error(error_msg)
401+
raise APIStatusError(error_msg, request_id=current_trace_id)
352402

353403
if msg.type != aiohttp.WSMsgType.TEXT:
354404
logger.warning("unexpected Minimax message type %s", msg.type)
355405
continue
356406

357407
data: dict[str, Any] = json.loads(msg.data)
358-
status_code = data.get("base_resp", {}).get("status_code")
408+
409+
# Extract trace_id (priority: root.trace_id > base_resp.trace_id)
410+
# api.minimax.io returns trace_id in root.trace_id, api.minimaxi.com may return in base_resp.trace_id
411+
msg_trace_id = data.get("trace_id") or data.get("base_resp", {}).get("trace_id")
412+
if msg_trace_id and msg_trace_id != current_trace_id:
413+
current_trace_id = msg_trace_id
414+
logger.debug(f"MiniMax WebSocket trace_id updated: {msg_trace_id}")
415+
416+
base_resp = data.get("base_resp", {})
417+
status_code = base_resp.get("status_code", 0)
359418
if status_code != 0:
419+
status_msg = base_resp.get("status_msg", "Unknown error")
420+
error_trace_id = msg_trace_id or current_trace_id
421+
422+
logger.error(
423+
f"MiniMax WebSocket error: code={status_code}, msg={status_msg}, trace_id={error_trace_id}",
424+
extra={"request_id": request_id, "full_response": data},
425+
)
426+
360427
raise APIStatusError(
361-
f"Minimax returned non-zero status code: {status_code}",
362-
request_id=request_id,
428+
f"MiniMax error [{status_code}]: {status_msg} (trace_id: {error_trace_id})",
429+
request_id=error_trace_id,
363430
body=data,
364431
)
365432

366433
if data.get("event") == "connected_success":
367-
pass
434+
logger.debug(f"MiniMax WebSocket connected, trace_id={current_trace_id}")
368435

369436
elif data.get("event") == "task_started":
370437
task_started.set_result(None)
371438
session_id = data.get("session_id", "")
439+
logger.debug(
440+
f"MiniMax WebSocket task_started, session_id={session_id}, trace_id={current_trace_id}"
441+
)
372442
output_emitter.start_segment(segment_id=session_id)
373443

374444
elif data.get("event") == "task_continued":
@@ -383,7 +453,11 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
383453
break
384454

385455
elif data.get("event") == "task_failed":
386-
raise APIError(f"Minimax returned task failed: {msg.data}")
456+
error_msg = (
457+
f"MiniMax returned task failed (trace_id: {current_trace_id}): {msg.data}"
458+
)
459+
logger.error(error_msg)
460+
raise APIError(error_msg)
387461

388462
else:
389463
logger.warning(f"unexpected Minimax message: {msg.data}")
@@ -404,13 +478,32 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
404478
await utils.aio.gracefully_cancel(*tasks)
405479

406480
except asyncio.TimeoutError:
407-
raise APITimeoutError() from None
481+
logger.error(
482+
f"MiniMax WebSocket request timeout after {self._conn_options.timeout}s, trace_id={trace_id}"
483+
)
484+
raise APITimeoutError(
485+
f"WebSocket TTS synthesis timed out after {self._conn_options.timeout}s (trace_id: {trace_id})"
486+
) from None
408487
except aiohttp.ClientResponseError as e:
488+
logger.error(
489+
f"WebSocket HTTP error: status={e.status}, message={e.message}, trace_id={trace_id}",
490+
exc_info=True,
491+
)
409492
raise APIStatusError(
410-
message=e.message, status_code=e.status, request_id=None, body=None
493+
message=f"WebSocket HTTP {e.status}: {e.message} (trace_id: {trace_id})",
494+
status_code=e.status,
495+
request_id=trace_id,
496+
body=None,
411497
) from e
412498
except Exception as e:
413-
raise APIConnectionError() from e
499+
if not isinstance(e, (APIStatusError, APITimeoutError, APIConnectionError)):
500+
logger.error(
501+
f"MiniMax WebSocket unexpected error: {type(e).__name__}: {e}, trace_id={trace_id}",
502+
exc_info=True,
503+
)
504+
raise APIConnectionError(
505+
f"WebSocket connection failed: {type(e).__name__}: {e} (trace_id: {trace_id})"
506+
) from e
414507

415508
async def aclose(self) -> None:
416509
await super().aclose()
@@ -448,8 +541,23 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
448541
) as resp:
449542
resp.raise_for_status()
450543

544+
# Extract trace_id from response headers (all requests have this)
545+
# Note: api.minimax.io also returns trace_id in response body root.trace_id
546+
trace_id = resp.headers.get("Trace-Id") or resp.headers.get("X-Trace-Id")
547+
minimax_request_id = resp.headers.get("Minimax-Request-Id")
548+
549+
if trace_id:
550+
logger.debug(
551+
f"MiniMax HTTP stream request started, trace_id={trace_id}, minimax_request_id={minimax_request_id}"
552+
)
553+
else:
554+
trace_id = utils.shortuuid()
555+
logger.warning(
556+
f"No Trace-Id in response headers, using generated ID: {trace_id}"
557+
)
558+
451559
output_emitter.initialize(
452-
request_id=utils.shortuuid(),
560+
request_id=trace_id,
453561
sample_rate=self._opts.sample_rate,
454562
num_channels=1,
455563
mime_type=f"audio/{self._opts.audio_format}",
@@ -464,24 +572,53 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
464572
continue
465573

466574
data = json.loads(line[5:])
575+
576+
# api.minimax.io returns trace_id in response body root level
577+
body_trace_id = data.get("trace_id")
578+
if body_trace_id and body_trace_id != trace_id:
579+
logger.debug(f"Found trace_id in response body: {body_trace_id}")
580+
467581
if audio := data.get("data", {}).get("audio"):
468582
output_emitter.push(bytes.fromhex(audio))
469-
elif (status_code := data.get("base_resp", {}).get("status_code")) != 0:
470-
raise APIStatusError(
471-
f"Minimax returned non-zero status code: {status_code}",
472-
request_id=None,
473-
body=data,
474-
)
583+
else:
584+
base_resp = data.get("base_resp", {})
585+
status_code = base_resp.get("status_code", 0)
586+
if status_code != 0:
587+
status_msg = base_resp.get("status_msg", "Unknown error")
588+
# trace_id priority: response body top level > response headers
589+
error_trace_id = body_trace_id or trace_id
590+
591+
logger.error(
592+
f"MiniMax HTTP stream error: code={status_code}, msg={status_msg}, trace_id={error_trace_id}",
593+
extra={"full_response": data},
594+
)
595+
596+
raise APIStatusError(
597+
f"MiniMax error [{status_code}]: {status_msg} (trace_id: {error_trace_id})",
598+
request_id=error_trace_id,
599+
body=data,
600+
)
475601
output_emitter.flush()
476602

477603
except asyncio.TimeoutError:
478-
raise APITimeoutError() from None
604+
logger.error(f"Minimax HTTP stream request timeout after {self._conn_options.timeout}s")
605+
raise APITimeoutError(
606+
f"TTS synthesis timed out after {self._conn_options.timeout}s"
607+
) from None
479608
except aiohttp.ClientResponseError as e:
609+
logger.error(f"HTTP error: status={e.status}, message={e.message}", exc_info=True)
480610
raise APIStatusError(
481-
message=e.message, status_code=e.status, request_id=None, body=None
482-
) from None
611+
message=f"HTTP {e.status}: {e.message}",
612+
status_code=e.status,
613+
request_id=None,
614+
body=None,
615+
) from e
483616
except Exception as e:
484-
raise APIConnectionError() from e
617+
if not isinstance(e, (APIStatusError, APITimeoutError, APIConnectionError)):
618+
logger.error(
619+
f"Minimax TTS unexpected error: {type(e).__name__}: {e}", exc_info=True
620+
)
621+
raise APIConnectionError(f"Connection failed: {type(e).__name__}: {e}") from e
485622

486623

487624
def _to_minimax_options(opts: _TTSOptions) -> dict[str, Any]:
@@ -499,6 +636,7 @@ def _to_minimax_options(opts: _TTSOptions) -> dict[str, Any]:
499636
"format": opts.audio_format,
500637
"channel": 1,
501638
},
639+
"text_normalization": opts.text_normalization,
502640
}
503641

504642
if opts.emotion is not None:

0 commit comments

Comments
 (0)