1 change: 1 addition & 0 deletions .gitignore
@@ -70,3 +70,4 @@ xcuserdata/
# uv
.build
claude.md
logs/
126 changes: 126 additions & 0 deletions MLX_AUDIO_GUIDE.md
@@ -0,0 +1,126 @@
# Chatterbox TTS Quick Reference

Multilingual voice cloning TTS using mlx-audio on Apple Silicon.

> **Note:** mlx-audio is already installed system-wide. Commands work from any folder.


---

## Web UI Usage

The MLX Audio Studio provides a user-friendly interface for Text-to-Speech generation, including full support for Chatterbox voice cloning.

### 1. Starting the Server

Run the following command to start both the API server and the Web UI:

```bash
python -m mlx_audio.server --start-ui
```

- **UI URL:** `http://localhost:3000`
- **API URL:** `http://localhost:8000`
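
The Web UI talks to an OpenAI-style `POST /v1/audio/speech` endpoint on port 8000 (see the `mlx_audio/server.py` changes below), so you can also call the API directly. A minimal sketch, assuming the server already has the Chatterbox model loaded; the field names come from the server's `SpeechRequest`, and `<BASE64_AUDIO>` is a placeholder for your base64-encoded reference clip:

```bash
curl -X POST http://localhost:8000/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
        "input": "Hello from the API!",
        "lang_code": "en",
        "exaggeration": 0.5,
        "cfg_weight": 0.5,
        "ref_audio": "data:audio/wav;base64,<BASE64_AUDIO>",
        "response_format": "mp3"
      }' \
  --output speech.mp3
```

The server decodes a `data:` URL in `ref_audio` to a temporary file before generation, as shown in the `generate_audio` diff below.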

### 2. Using Chatterbox TTS

1. **Select Model:** Choose `Chatterbox` from the **Model** dropdown menu.
2. **Reference Audio:** Click "Choose audio file..." to upload a 5-10 second clip of the voice you want to clone.
   - *Note:* This is required for Chatterbox to work.
3. **Select Language:** Choose the target language from the **Language** dropdown (e.g., English, Chinese, Japanese).
   - *Important:* Ensure this matches the text you are generating to avoid pronunciation issues.
4. **Adjust Settings:**
   - **Emotion Exaggeration:** Controls expressiveness (0.0-1.0). Default is 0.5. Higher values make the voice more emotional but can be unstable.
   - **Guidance Weight:** Controls how closely the model follows the text/audio conditioning. Default is 0.5.
5. **Generate:** Enter your text and click **Generate**.
6. **Download:** Click the download icon to save the generated audio as an MP3 file.

### 3. Troubleshooting

- **"Failed to fetch"**: If generation takes too long (Chatterbox is slow), the UI might timeout. We've increased the timeout to 5 minutes for Chatterbox.
- **Pronunciation issues**: If words sound wrong (e.g., "But I" -> "boot yi"), check that the **Language** dropdown is set correctly (e.g., to "English").

---

## CLI Usage

### Basic Chinese TTS

```bash
mlx_audio.tts.generate \
  --model mlx-community/chatterbox-fp16 \
  --text "你好,今天天气真不错!" \
  --lang_code zh \
  --ref_audio /path/to/reference_voice.mp3 \
  --file_prefix output
```

### All CLI Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--model` | Model path (use `mlx-community/chatterbox-fp16`) | Required |
| `--text` | Text to synthesize | Required |
| `--lang_code` | Language code (see below) | `en` |
| `--ref_audio` | Path to reference audio for voice cloning | Required |
| `--file_prefix` | Output filename prefix | `audio` |
| `--audio_format` | Output format: `wav`, `mp3`, `flac` | `wav` |
| `--exaggeration` | Emotion intensity (0.0-1.0) | `0.5` |
| `--cfg_scale` | Classifier-free guidance (lower = more stable) | `0.5` |
| `--temperature` | Sampling temperature | `0.8` |
| `--max_tokens` | Max tokens to generate (lower = faster) | `1000` |
| `--verbose` | Print detailed output | `false` |
| `--play` | Play audio after generation | `false` |
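
For example, a single command combining several of these flags (English text, MP3 output, higher expressiveness, and a token cap for speed); every flag here comes from the table above:

```bash
mlx_audio.tts.generate \
  --model mlx-community/chatterbox-fp16 \
  --text "Welcome back! Let's get started." \
  --lang_code en \
  --ref_audio /path/to/reference_voice.mp3 \
  --audio_format mp3 \
  --exaggeration 0.7 \
  --max_tokens 200 \
  --file_prefix welcome
```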

---

## Python API

```python
import mlx.core as mx
import soundfile as sf
import librosa
from mlx_audio.tts.utils import load_model

# Load multilingual Chatterbox model
model = load_model('mlx-community/chatterbox-fp16')

# Load reference audio (for voice cloning)
audio, sr = librosa.load('/path/to/reference.mp3', sr=24000, mono=True)
conds = model.prepare_conditionals(mx.array(audio), sr, exaggeration=0.5)

# Generate Chinese speech; generate() may yield multiple segments,
# so write each one to its own file instead of overwriting a single path
for i, result in enumerate(model.generate(
    text="你好,我是语音合成模型。",
    conds=conds,
    lang_code='zh',
    max_new_tokens=300
)):
    sf.write(f'output_{i}.wav', result.audio, result.sample_rate)
```
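
If you'd rather end up with a single file, the yielded segments can be collected and concatenated first (a minimal sketch reusing `model`, `conds`, and `sf` from the snippet above, and assuming every segment is a 1-D array at the same sample rate):

```python
import numpy as np

segments, sample_rate = [], None
for result in model.generate(
    text="你好,我是语音合成模型。",
    conds=conds,
    lang_code='zh',
    max_new_tokens=300,
):
    segments.append(np.asarray(result.audio))
    sample_rate = result.sample_rate  # assumed constant across segments

sf.write('combined.wav', np.concatenate(segments), sample_rate)
```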

---

## Language Codes

| Code | Language | Code | Language |
|------|----------|------|----------|
| `zh` | Chinese | `ja` | Japanese |
| `en` | English | `ko` | Korean |
| `es` | Spanish | `ar` | Arabic |
| `fr` | French | `hi` | Hindi |
| `de` | German | `ru` | Russian |
| `it` | Italian | `pt` | Portuguese |
| `nl` | Dutch | `pl` | Polish |
| `tr` | Turkish | `sv` | Swedish |

Full list: `ar`, `da`, `de`, `el`, `en`, `es`, `fi`, `fr`, `he`, `hi`, `it`, `ja`, `ko`, `ms`, `nl`, `no`, `pl`, `pt`, `ru`, `sv`, `sw`, `tr`, `zh`

---

## Tips

1. **Reference audio**: use 3-10 seconds of clean speech
2. **First run**: downloads the 2.6 GB model (~5-10 min)
3. **Faster generation**: use `--max_tokens 200` for short text
4. **Exaggeration**: higher values give more emotional expression
97 changes: 71 additions & 26 deletions mlx_audio/server.py
@@ -96,7 +96,6 @@ async def get_available_models(self):


def int_or_float(value):

    try:
        return int(value)
    except ValueError:
@@ -164,6 +163,9 @@ class SpeechRequest(BaseModel):
    top_k: int | None = 40
    repetition_penalty: float | None = 1.0
    response_format: str | None = "mp3"
    # Chatterbox-specific parameters
    exaggeration: float | None = 0.5
    cfg_weight: float | None = 0.5


# Initialize the ModelProvider
@@ -234,26 +236,69 @@ async def remove_model(model_name: str):


async def generate_audio(model, payload: SpeechRequest, verbose: bool = False):
    for result in model.generate(
        payload.input,
        voice=payload.voice,
        speed=payload.speed,
        gender=payload.gender,
        pitch=payload.pitch,
        lang_code=payload.lang_code,
        ref_audio=payload.ref_audio,
        ref_text=payload.ref_text,
        temperature=payload.temperature,
        top_p=payload.top_p,
        top_k=payload.top_k,
        repetition_penalty=payload.repetition_penalty,
    ):

        sample_rate = result.sample_rate
        buffer = io.BytesIO()
        sf.write(buffer, result.audio, sample_rate, format=payload.response_format)
        buffer.seek(0)
        yield buffer.getvalue()
    import base64
    import tempfile

    # Handle base64 ref_audio - decode to temp file
    ref_audio = payload.ref_audio
    temp_audio_path = None

    if ref_audio and ref_audio.startswith("data:"):
        # Parse data URL: data:audio/wav;base64,XXXX...
        try:
            header, data = ref_audio.split(",", 1)
            audio_bytes = base64.b64decode(data)

            # Determine file extension from MIME type
            ext = ".wav"
            if "audio/mp3" in header or "audio/mpeg" in header:
                ext = ".mp3"
            elif "audio/ogg" in header:
                ext = ".ogg"

            # Save to temp file
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f:
                f.write(audio_bytes)
                temp_audio_path = f.name
            ref_audio = temp_audio_path
        except Exception as e:
            print(f"Error decoding base64 audio: {e}")
            ref_audio = None

    # Debug logging
    if ref_audio:
        print(
            f"[DEBUG] Using ref_audio: {ref_audio[:50] if isinstance(ref_audio, str) else 'array'}..."
        )
    else:
        print("[DEBUG] No ref_audio provided - will use default voice")

    try:
        for result in model.generate(
            payload.input,
            voice=payload.voice,
            speed=payload.speed,
            gender=payload.gender,
            pitch=payload.pitch,
            lang_code=payload.lang_code,
            ref_audio=ref_audio,
            ref_text=payload.ref_text,
            temperature=payload.temperature,
            top_p=payload.top_p,
            top_k=payload.top_k,
            repetition_penalty=payload.repetition_penalty,
            exaggeration=payload.exaggeration,
            cfg_weight=payload.cfg_weight,
        ):
            sample_rate = result.sample_rate
            buffer = io.BytesIO()
            sf.write(buffer, result.audio, sample_rate, format=payload.response_format)
            buffer.seek(0)
            yield buffer.getvalue()
    finally:
        # Clean up temp file
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)


@app.post("/v1/audio/speech")
@@ -395,7 +440,7 @@ async def stt_realtime_transcriptions(websocket: WebSocket):
if len(audio_buffer) % (sample_rate * 2) < len(audio_chunk_float):
# Log every ~2 seconds of buffer
print(
f"Speech detected ({speech_frames}/{num_frames} frames): buffer {len(audio_buffer)} samples ({len(audio_buffer)/sample_rate:.2f}s)"
f"Speech detected ({speech_frames}/{num_frames} frames): buffer {len(audio_buffer)} samples ({len(audio_buffer) / sample_rate:.2f}s)"
)
else:
silence_skip_count += 1
@@ -420,7 +465,7 @@ async def stt_realtime_transcriptions(websocket: WebSocket):
):
should_process_initial = True
print(
f"Processing initial chunk for real-time feedback: {initial_chunk_size/sample_rate:.2f}s, total buffer: {len(audio_buffer)/sample_rate:.2f}s"
f"Processing initial chunk for real-time feedback: {initial_chunk_size / sample_rate:.2f}s, total buffer: {len(audio_buffer) / sample_rate:.2f}s"
)
# Process if we have enough silence after speech (end of utterance)
elif (
Expand All @@ -429,13 +474,13 @@ async def stt_realtime_transcriptions(websocket: WebSocket):
):
should_process_final = True
print(
f"Processing due to silence gap: {time_since_last_speech:.2f}s silence, buffer: {len(audio_buffer)/sample_rate:.2f}s"
f"Processing due to silence gap: {time_since_last_speech:.2f}s silence, buffer: {len(audio_buffer) / sample_rate:.2f}s"
)
# Or if buffer is getting too large (continuous speech)
elif len(audio_buffer) >= max_chunk_size:
should_process_final = True
print(
f"Processing due to max buffer size: {len(audio_buffer)/sample_rate:.2f}s"
f"Processing due to max buffer size: {len(audio_buffer) / sample_rate:.2f}s"
)

# Process initial chunk for real-time feedback
Expand Down Expand Up @@ -542,7 +587,7 @@ async def stt_realtime_transcriptions(websocket: WebSocket):
initial_chunk_processed = False
last_process_time = current_time
print(
f"Processed final chunk: {process_size} samples ({process_size/sample_rate:.2f}s), buffer cleared"
f"Processed final chunk: {process_size} samples ({process_size / sample_rate:.2f}s), buffer cleared"
)

except Exception as e: