From 481395c87740c80beb02662ba0f672fd2bf5c916 Mon Sep 17 00:00:00 2001 From: Angelo Giacco Date: Wed, 5 Feb 2025 22:42:47 +0000 Subject: [PATCH 1/3] feat: asynciterable support --- src/elevenlabs/client.py | 57 ++++++++++++---- src/elevenlabs/realtime_tts.py | 116 ++++++++++++++++++++++++++++++++- tests/test_async_generation.py | 22 +++++++ 3 files changed, 181 insertions(+), 14 deletions(-) create mode 100644 tests/test_async_generation.py diff --git a/src/elevenlabs/client.py b/src/elevenlabs/client.py index d5c3ee73..ccbbf822 100644 --- a/src/elevenlabs/client.py +++ b/src/elevenlabs/client.py @@ -5,7 +5,7 @@ import httpx from typing import Iterator, Optional, Union, \ - Optional, AsyncIterator + Optional, AsyncIterable, AsyncIterator from .base_client import \ BaseElevenLabs, AsyncBaseElevenLabs @@ -13,7 +13,7 @@ from .types import Voice, VoiceSettings, \ PronunciationDictionaryVersionLocator, Model from .environment import ElevenLabsEnvironment -from .realtime_tts import RealtimeTextToSpeechClient +from .realtime_tts import RealtimeTextToSpeechClient, AsyncRealtimeTextToSpeechClient from .types import OutputFormat @@ -257,6 +257,25 @@ class AsyncElevenLabs(AsyncBaseElevenLabs): api_key="YOUR_API_KEY", ) """ + def __init__( + self, + *, + base_url: typing.Optional[str] = None, + environment: ElevenLabsEnvironment = ElevenLabsEnvironment.PRODUCTION, + api_key: typing.Optional[str] = os.getenv("ELEVENLABS_API_KEY"), + timeout: typing.Optional[float] = None, + follow_redirects: typing.Optional[bool] = True, + httpx_client: typing.Optional[httpx.AsyncClient] = None + ): + super().__init__( + base_url=base_url, + environment=environment, + api_key=api_key, + timeout=timeout, + follow_redirects=follow_redirects, + httpx_client=httpx_client, + ) + self.text_to_speech = AsyncRealtimeTextToSpeechClient(client_wrapper=self._client_wrapper) async def clone( self, @@ -299,7 +318,7 @@ async def clone( async def generate( self, *, - text: str, + text: Union[str, AsyncIterable[str]], voice: Union[VoiceId, VoiceName, Voice] = DEFAULT_VOICE, voice_settings: typing.Optional[VoiceSettings] = DEFAULT_VOICE.settings, model: Union[ModelId, Model] = "eleven_multilingual_v2", @@ -383,16 +402,28 @@ async def generate( model_id = model.model_id if stream: - return self.text_to_speech.convert_as_stream( - voice_id=voice_id, - model_id=model_id, - voice_settings=voice_settings, - optimize_streaming_latency=optimize_streaming_latency, - output_format=output_format, - text=text, - request_options=request_options, - pronunciation_dictionary_locators=pronunciation_dictionary_locators - ) + if isinstance(text, AsyncIterable): + return self.text_to_speech.convert_realtime( # type: ignore + voice_id=voice_id, + voice_settings=voice_settings, + output_format=output_format, + text=text, + request_options=request_options, + model_id=model_id + ) + elif isinstance(text, str): + return self.text_to_speech.convert_as_stream( + voice_id=voice_id, + model_id=model_id, + voice_settings=voice_settings, + optimize_streaming_latency=optimize_streaming_latency, + output_format=output_format, + text=text, + request_options=request_options, + pronunciation_dictionary_locators=pronunciation_dictionary_locators + ) + else: + raise ApiError(body="Text is neither a string nor an iterator.") else: if not isinstance(text, str): raise ApiError(body="Text must be a string when stream is False.") diff --git a/src/elevenlabs/realtime_tts.py b/src/elevenlabs/realtime_tts.py index 799c9e53..54d8cd87 100644 --- a/src/elevenlabs/realtime_tts.py +++ b/src/elevenlabs/realtime_tts.py @@ -5,8 +5,10 @@ import json import base64 import websockets +import asyncio from websockets.sync.client import connect +from websockets.client import connect as async_connect from .core.api_error import ApiError from .core.client_wrapper import SyncClientWrapper @@ -14,7 +16,7 @@ from .core.remove_none_from_dict import remove_none_from_dict from .core.request_options import RequestOptions from .types.voice_settings import VoiceSettings -from .text_to_speech.client import TextToSpeechClient +from .text_to_speech.client import TextToSpeechClient, AsyncTextToSpeechClient from .types import OutputFormat # this is used as the default value for optional parameters @@ -39,6 +41,24 @@ def text_chunker(chunks: typing.Iterator[str]) -> typing.Iterator[str]: yield buffer + " " +async def async_text_chunker(chunks: typing.AsyncIterator[str]) -> typing.AsyncIterator[str]: + """Used during input streaming to chunk text blocks and set last char to space""" + splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ") + buffer = "" + async for text in chunks: + if buffer.endswith(splitters): + yield buffer if buffer.endswith(" ") else buffer + " " + buffer = text + elif text.startswith(splitters): + output = buffer + text[0] + yield output if output.endswith(" ") else output + " " + buffer = text[1:] + else: + buffer += text + if buffer != "": + yield buffer + " " + + class RealtimeTextToSpeechClient(TextToSpeechClient): def __init__(self, *, client_wrapper: SyncClientWrapper): super().__init__(client_wrapper=client_wrapper) @@ -141,3 +161,97 @@ def get_text() -> typing.Iterator[str]: raise ApiError(body=data, status_code=ce.code) elif ce.code != 1000: raise ApiError(body=ce.reason, status_code=ce.code) + +class AsyncRealtimeTextToSpeechClient(AsyncTextToSpeechClient): + async def convert_realtime( + self, + voice_id: str, + *, + text: typing.AsyncIterable[str], + model_id: typing.Optional[str] = OMIT, + output_format: typing.Optional[OutputFormat] = "mp3_44100_128", + voice_settings: typing.Optional[VoiceSettings] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> typing.AsyncIterator[bytes]: + """ + Converts text into speech using a voice of your choice and returns audio. + Parameters: + - voice_id: str. Voice ID to be used, you can use https://api.elevenlabs.io/v1/voices to list all the available voices. + + - text: typing.Iterator[str]. The text that will get converted into speech. + - model_id: typing.Optional[str]. Identifier of the model that will be used, you can query them using GET /v1/models. The model needs to have support for text to speech, you can check this using the can_do_text_to_speech property. + - voice_settings: typing.Optional[VoiceSettings]. Voice settings overriding stored setttings for the given voice. They are applied only on the given request. + - request_options: typing.Optional[RequestOptions]. Request-specific configuration. + --- + from elevenlabs import PronunciationDictionaryVersionLocator, VoiceSettings + from elevenlabs.client import ElevenLabs + def get_text() -> typing.Iterator[str]: + yield "Hello, how are you?" + yield "I am fine, thank you." + client = ElevenLabs( + api_key="YOUR_API_KEY", + ) + client.text_to_speech.convert_realtime( + voice_id="string", + text=get_text(), + model_id="string", + voice_settings=VoiceSettings( + stability=1.1, + similarity_boost=1.1, + style=1.1, + use_speaker_boost=True, + ), + ) + """ + async with async_connect( + urllib.parse.urljoin( + "wss://api.elevenlabs.io/", + f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?model_id={model_id}&output_format={output_format}" + ), + extra_headers=jsonable_encoder( + remove_none_from_dict( + { + **self._client_wrapper.get_headers(), + **(request_options.get("additional_headers", {}) if request_options is not None else {}), + } + ) + ) + ) as socket: + try: + await socket.send(json.dumps( + dict( + text=" ", + try_trigger_generation=True, + voice_settings=voice_settings.dict() if voice_settings else None, + generation_config=dict( + chunk_length_schedule=[50], + ), + ) + )) + except websockets.exceptions.ConnectionClosedError as ce: + raise ApiError(body=ce.reason, status_code=ce.code) + + try: + async for text_chunk in async_text_chunker(text): + data = dict(text=text_chunk, try_trigger_generation=True) + await socket.send(json.dumps(data)) + try: + async with asyncio.timeout(1e-2): + data = json.loads(await socket.recv()) + if "audio" in data and data["audio"]: + yield base64.b64decode(data["audio"]) # type: ignore + except TimeoutError: + pass + + await socket.send(json.dumps(dict(text=""))) + + while True: + + data = json.loads(await socket.recv()) + if "audio" in data and data["audio"]: + yield base64.b64decode(data["audio"]) # type: ignore + except websockets.exceptions.ConnectionClosed as ce: + if "message" in data: + raise ApiError(body=data, status_code=ce.code) + elif ce.code != 1000: + raise ApiError(body=ce.reason, status_code=ce.code) \ No newline at end of file diff --git a/tests/test_async_generation.py b/tests/test_async_generation.py new file mode 100644 index 00000000..eee6a8a0 --- /dev/null +++ b/tests/test_async_generation.py @@ -0,0 +1,22 @@ +import asyncio +import pytest + +from .utils import IN_GITHUB, async_client +from elevenlabs import play + +def test_generate_stream() -> None: + async def main(): + async def text_stream(): + yield "Hi there, I'm Eleven Labs " + yield "I'm an AI Audio Research Company " + + audio_stream = await async_client.generate( + text=text_stream(), + voice="Adam", + model="eleven_monolingual_v1", + stream=True + ) + + if not IN_GITHUB: + stream(audio_stream) # type: ignore + asyncio.run(main()) \ No newline at end of file From ca24caac0138922471318b30809f79950adaeef8 Mon Sep 17 00:00:00 2001 From: Angelo Giacco Date: Mon, 17 Feb 2025 16:54:23 +0000 Subject: [PATCH 2/3] fix: tests and async chunking correctly uses asynciterable --- .gitignore | 1 + src/elevenlabs/realtime_tts.py | 2 +- tests/test_async_generation.py | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0da665fe..030cd07b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ dist/ __pycache__/ poetry.toml .ruff_cache/ +env/ \ No newline at end of file diff --git a/src/elevenlabs/realtime_tts.py b/src/elevenlabs/realtime_tts.py index 54d8cd87..a6ee6140 100644 --- a/src/elevenlabs/realtime_tts.py +++ b/src/elevenlabs/realtime_tts.py @@ -41,7 +41,7 @@ def text_chunker(chunks: typing.Iterator[str]) -> typing.Iterator[str]: yield buffer + " " -async def async_text_chunker(chunks: typing.AsyncIterator[str]) -> typing.AsyncIterator[str]: +async def async_text_chunker(chunks: typing.AsyncIterable[str]) -> typing.AsyncIterable[str]: """Used during input streaming to chunk text blocks and set last char to space""" splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ") buffer = "" diff --git a/tests/test_async_generation.py b/tests/test_async_generation.py index eee6a8a0..9e4204df 100644 --- a/tests/test_async_generation.py +++ b/tests/test_async_generation.py @@ -1,9 +1,12 @@ import asyncio import pytest -from .utils import IN_GITHUB, async_client +from .utils import IN_GITHUB +from elevenlabs import AsyncElevenLabs from elevenlabs import play +async_client = AsyncElevenLabs() + def test_generate_stream() -> None: async def main(): async def text_stream(): From 4b002d8df44384b7a1aec73531dce8890353cb14 Mon Sep 17 00:00:00 2001 From: Angelo Giacco Date: Mon, 17 Feb 2025 17:04:53 +0000 Subject: [PATCH 3/3] fix in CI --- src/elevenlabs/realtime_tts.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/elevenlabs/realtime_tts.py b/src/elevenlabs/realtime_tts.py index a6ee6140..4573865a 100644 --- a/src/elevenlabs/realtime_tts.py +++ b/src/elevenlabs/realtime_tts.py @@ -236,10 +236,9 @@ def get_text() -> typing.Iterator[str]: data = dict(text=text_chunk, try_trigger_generation=True) await socket.send(json.dumps(data)) try: - async with asyncio.timeout(1e-2): - data = json.loads(await socket.recv()) - if "audio" in data and data["audio"]: - yield base64.b64decode(data["audio"]) # type: ignore + async with json.loads(await asyncio.wait_for(socket.recv(), timeout=1e-2)): + if "audio" in data and data["audio"]: + yield base64.b64decode(data["audio"]) # type: ignore except TimeoutError: pass