- 
                Notifications
    You must be signed in to change notification settings 
- Fork 2.8k
Open
Labels
Description
Describe the bug
When using the Agents SDK with a custom Text-to-Speech model provider (ElevenLabs) to stream audio in MP3 format, I encounter a `ValueError` whenever the length of an audio chunk is odd.
The error stems from `_transform_audio_buffer` in the `StreamedAudioResult` class.
Debug information
- Agents SDK version: 0.3.2
- Python version 3.13
- Numpy 2.3.3
- elevenlabs 2.16.0
Repro steps
To run this script, you will need an ElevenLabs API key.
Audio format requested: MP3 with a sample rate of 44100 Hz and a bit rate of 96 kbps (`mp3_44100_96`).
When a padding byte is appended to a chunk whose length is odd, the error disappears.
# voice.py
import asyncio
from collections.abc import AsyncIterator
import os
import numpy as np
from typing import Final
from agents.voice import (
    TTSModelSettings,
    VoicePipelineConfig
)
from agents.voice.events import (
    VoiceStreamEventAudio,
    VoiceStreamEventError,
    VoiceStreamEventLifecycle,
)
from agents.voice import TTSModel, TTSModelSettings
from agents.voice.result import StreamedAudioResult
from elevenlabs.client import ElevenLabs
from datetime import datetime
from dotenv import load_dotenv
# Runs at import time, before ElevenlabsModel reads ELEVENLABS_API_KEY via
# os.getenv; presumably this loads the key from a local .env file — confirm
# against the python-dotenv documentation.
load_dotenv()
# Model identifier returned by ElevenlabsModel.model_name and sent as model_id.
ELEVEN_MULTILINGUAL_V2: Final[str] = "eleven_multilingual_v2"
# Voice ID sent with every request; just a random voice to use for the example.
DEFAULT_VOICE: Final[str] = "21m00Tcm4TlvDq8ikWAM"
class ElevenlabsModel(TTSModel):
    """TTS model that streams audio bytes from the ElevenLabs client."""

    def __init__(
        self,
    ) -> None:
        super().__init__()
        # Provide your ElevenLabs API key through the ELEVENLABS_API_KEY
        # environment variable.
        api_key = os.getenv("ELEVENLABS_API_KEY")
        self._elevenlabs = ElevenLabs(api_key=api_key)

    @property
    def model_name(self) -> str:
        """The name of the TTS model."""
        return ELEVEN_MULTILINGUAL_V2

    def run(self, text: str, settings: TTSModelSettings) -> AsyncIterator[bytes]:
        """Return an async iterator over the audio bytes synthesized for *text*.

        Args:
            text: The text to convert to audio.
            settings: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            An async iterator of audio bytes.
        """
        # MP3 with sample rate 44100 and bit rate of 96
        output_format = "mp3_44100_96"
        voice_id = DEFAULT_VOICE

        async def _chunks() -> AsyncIterator[bytes]:
            source = self._elevenlabs.text_to_speech.stream(
                text=text,
                voice_id=voice_id,
                model_id=self.model_name,
                output_format=output_format,
                language_code=None,
            )
            # Emission lags one chunk behind the source so that the final
            # chunk is still in hand, and can be inspected, after the loop.
            held: bytes = b""
            for piece in source:
                if held:
                    yield held
                # Give other tasks a turn between chunks; ``source`` is
                # consumed with a plain (synchronous) ``for`` loop.
                await asyncio.sleep(0)
                held = bytes(piece)
            if held:
                size = len(held)
                if size % 2:
                    print(f"Warning: Final chunk has odd length {size}, padding")
                    # Deliberately left un-padded so the reported error
                    # reproduces. If we pad with a zero byte, the error
                    # disappears:
                    # held += b"\x00"
                yield held

        return _chunks()
async def main():
    """Synthesize one sentence and write the streamed audio to a new file.

    Builds a ``StreamedAudioResult`` around ``ElevenlabsModel``, feeds it a
    single sentence, marks the turn and the result as done, then writes
    every audio event to a timestamped ``.mp3`` file in the current working
    directory. Lifecycle events and errors are printed; an error event or
    an unrecognized event ends the loop.
    """
    result = StreamedAudioResult(
        ElevenlabsModel(),
        TTSModelSettings(),
        VoicePipelineConfig(),
    )
    # Timestamped filename in the current directory, to avoid overwrites.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = os.path.join(os.getcwd(), f"tts_{stamp}.mp3")
    # NOTE: the literal opens with four quote characters, so the text that
    # is sent begins with a literal '"' character.
    await result._add_text(""""Hey, how are you doing today?""")
    await result._turn_done()
    await result._done()
    with open(path, "wb") as sink:
        async for event in result.stream():
            if isinstance(event, VoiceStreamEventAudio):
                # Presumably raw MP3 bytes, given the requested output
                # format — TODO confirm what the pipeline puts in event.data.
                payload = np.ascontiguousarray(event.data).tobytes()
                sink.write(payload)
            elif isinstance(event, VoiceStreamEventLifecycle):
                print(event.event)
            elif isinstance(event, VoiceStreamEventError):
                print(event.error)
                break
            else:
                break
if __name__ == "__main__":
    asyncio.run(main())
Error Stack
Error streaming audio: buffer size must be a multiple of element size
Traceback (most recent call last):
  File "voice.py", line 107, in <module>
    asyncio.run(main())
  File "<python stdlib>/asyncio/runners.py", line 195, in run
    return runner.run(main)
  File "<python stdlib>/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
  File "<python stdlib>/asyncio/base_events.py", line 719, in run_until_complete
    return future.result()
  File "voice.py", line 92, in main
    await output._turn_done()
  File "<site-packages>/agents/voice/result.py", line 201, in _turn_done
    await asyncio.gather(*self._tasks)
  File "<site-packages>/agents/voice/result.py", line 168, in _stream_audio
    raise e
  File "<site-packages>/agents/voice/result.py", line 139, in _stream_audio
    audio_np = self._transform_audio_buffer(buffer, self.tts_settings.dtype)
  File "<site-packages>/agents/voice/result.py", line 91, in _transform_audio_buffer
    np_array = np.frombuffer(b"".join(buffer), dtype=np.int16)
ValueError: buffer size must be a multiple of element size
Expected behavior
Audio should play without errors regardless of whether an audio chunk's byte length is odd or even.