Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,51 @@
]

# Known model name strings, expressed as a static Literal type.
TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo", "sonic-3"]
# NOTE(review): this one-line alias is rebound by the multi-line definition
# directly below, so only the longer list takes effect. In this diff view it
# appears to be the pre-change line — confirm it is absent from the real file.
TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
# Language code strings, expressed as a static Literal type. The first seven
# entries repeat the one-line list above; the rest are additions.
# NOTE(review): these look like two-letter ISO 639-1 codes — confirm against
# the provider's list of supported languages.
TTSLanguages = Literal[
"en",
"es",
"fr",
"de",
"pt",
"zh",
"ja",
"hi",
"ko",
"it",
"nl",
"pl",
"ru",
"sv",
"tr",
"tl",
"bg",
"ro",
"ar",
"cs",
"el",
"fi",
"hr",
"ms",
"sk",
"da",
"ta",
"uk",
"hu",
"no",
"vi",
"bn",
"th",
"he",
"ka",
"id",
"te",
"gu",
"kn",
"ml",
"mr",
"or",
"pa",
]
# Voice ID string used as the default (presumably when the caller does not
# pick a voice — verify at the use site); the trailing comment names the voice.
TTSDefaultVoiceId = "f786b574-daa5-4673-aa0c-cbe3e8534c02" # Katie - Friendly Fixer
# Named speed presets, listed from fastest to slowest.
TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class _TTSOptions:
emotion: list[TTSVoiceEmotion | str] | None
volume: float | None
word_timestamps: bool
add_phoneme_timestamps: bool
use_normalized_timestamps: bool
api_key: str
language: LanguageCode | None
base_url: str
Expand Down Expand Up @@ -97,6 +99,8 @@ def __init__(
volume: float | None = None,
sample_rate: int = 24000,
word_timestamps: bool = True,
add_phoneme_timestamps: bool = False,
use_normalized_timestamps: bool = False,
pronunciation_dict_id: str | None = None,
http_session: aiohttp.ClientSession | None = None,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
Expand All @@ -119,6 +123,8 @@ def __init__(
volume (float, optional): Volume of the speech. With sonic-3, valid values are between 0.5 and 2.0.
sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
add_phoneme_timestamps (bool, optional): Whether to add phoneme-level timestamps to the output. Defaults to False.
use_normalized_timestamps (bool, optional): Whether to use normalized timestamps. Defaults to False.
pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations. Defaults to None.
api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
Expand Down Expand Up @@ -157,6 +163,8 @@ def __init__(
api_key=cartesia_api_key,
base_url=base_url,
word_timestamps=word_timestamps,
add_phoneme_timestamps=add_phoneme_timestamps,
use_normalized_timestamps=use_normalized_timestamps,
api_version=api_version,
pronunciation_dict_id=pronunciation_dict_id,
)
Expand Down Expand Up @@ -243,6 +251,8 @@ def update_options(
emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
volume: NotGivenOr[float] = NOT_GIVEN,
pronunciation_dict_id: NotGivenOr[str] = NOT_GIVEN,
add_phoneme_timestamps: NotGivenOr[bool] = NOT_GIVEN,
use_normalized_timestamps: NotGivenOr[bool] = NOT_GIVEN,
api_version: NotGivenOr[str] = NOT_GIVEN,
) -> None:
"""
Expand All @@ -258,6 +268,8 @@ def update_options(
speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations.
add_phoneme_timestamps (bool, optional): Whether to add phoneme-level timestamps to the output.
use_normalized_timestamps (bool, optional): Whether to use normalized timestamps.
"""
if is_given(model):
self._opts.model = model
Expand All @@ -274,6 +286,10 @@ def update_options(
self._opts.volume = volume
if is_given(pronunciation_dict_id):
self._opts.pronunciation_dict_id = pronunciation_dict_id
if is_given(add_phoneme_timestamps):
self._opts.add_phoneme_timestamps = add_phoneme_timestamps
if is_given(use_normalized_timestamps):
self._opts.use_normalized_timestamps = use_normalized_timestamps
if is_given(api_version):
self._opts.api_version = api_version

Expand Down Expand Up @@ -503,6 +519,8 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse, cartesia_context_id: s
output_emitter.push_timed_transcript(
TimedString(text=word, start_time=start, end_time=end)
)
elif data.get("phoneme_timestamps"):
pass # phoneme_timestamps are received but not surfaced by the output emitter
elif data.get("type") == "error":
logger.error(
"Cartesia returned error. Include the cartesia_context_id to support@cartesia.ai for help debugging.",
Expand Down Expand Up @@ -594,5 +612,9 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any

if streaming:
options["add_timestamps"] = opts.word_timestamps
if opts.add_phoneme_timestamps:
options["add_phoneme_timestamps"] = True
if opts.use_normalized_timestamps:
options["use_normalized_timestamps"] = True

return options
Loading