
Commit 7eb880c

Add DeepgramHttpTTSService
1 parent: 4fa0de6

4 files changed: +248 -0 lines changed


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added a new `DeepgramHttpTTSService`, which delivers a meaningful reduction
+  in latency when compared to the `DeepgramTTSService`.
+
 - Add support for `speaking_rate` input parameter in `GoogleHttpTTSService`.
 
 - Added `enable_speaker_diarization` and `enable_language_identification` to
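In short, the new service needs only an API key and a caller-owned aiohttp session; everything else has defaults. A minimal construction sketch based on the constructor added in src/pipecat/services/deepgram/tts.py below (the getenv-based key lookup is illustrative, not part of this commit); the full foundational example follows:

import asyncio
import os

import aiohttp

from pipecat.services.deepgram.tts import DeepgramHttpTTSService


async def main():
    # The service borrows a caller-owned session so its HTTP requests can
    # reuse pooled connections; the caller is responsible for closing it.
    async with aiohttp.ClientSession() as session:
        tts = DeepgramHttpTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),  # illustrative key lookup
            aiohttp_session=session,
            # Optional, with the committed defaults:
            #   voice="aura-2-helena-en", encoding="linear16",
            #   base_url="https://api.deepgram.com", sample_rate=None
        )
        print(tts)  # ready to drop into a Pipeline as the TTS stage


asyncio.run(main())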
examples/foundational/07c-interruptible-deepgram-http.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#


import os

import aiohttp
from dotenv import load_dotenv
from loguru import logger

from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.tts import DeepgramHttpTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

load_dotenv(override=True)


# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
}


async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = DeepgramHttpTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-2-andromeda-en",
            aiohttp_session=session,
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = LLMContext(messages)
        context_aggregator = LLMContextAggregatorPair(context)

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info(f"Client connected")
            # Kick off the conversation.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info(f"Client disconnected")
            await task.cancel()

        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

        await runner.run(task)


async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud."""
    transport = await create_transport(runner_args, transport_params)
    await run_bot(transport, runner_args)


if __name__ == "__main__":
    from pipecat.runner.run import main

    main()
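Two details of the example are worth noting. The transport_params dict holds factories rather than instances, so only the selected transport's VAD and turn analyzers ever get built. And the single aiohttp.ClientSession opened for the lifetime of run_bot() is the one handed to DeepgramHttpTTSService, so each per-utterance TTS request can reuse pooled connections instead of opening a fresh one.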

scripts/evals/run-release-evals.py

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
     ("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
     ("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
     ("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
+    ("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
     ("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
     ("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
     ("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),

src/pipecat/services/deepgram/tts.py

Lines changed: 112 additions & 0 deletions
@@ -12,6 +12,7 @@
 
 from typing import AsyncGenerator, Optional
 
+import aiohttp
 from loguru import logger
 
 from pipecat.frames.frames import (
@@ -117,3 +118,114 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         except Exception as e:
             logger.exception(f"{self} exception: {e}")
             yield ErrorFrame(f"Error getting audio: {str(e)}")
+
+
+class DeepgramHttpTTSService(TTSService):
+    """Deepgram HTTP text-to-speech service.
+
+    Provides text-to-speech synthesis using Deepgram's HTTP TTS API.
+    Supports various voice models and audio encoding formats with
+    configurable sample rates and quality settings.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        voice: str = "aura-2-helena-en",
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.deepgram.com",
+        sample_rate: Optional[int] = None,
+        encoding: str = "linear16",
+        **kwargs,
+    ):
+        """Initialize the Deepgram TTS service.
+
+        Args:
+            api_key: Deepgram API key for authentication.
+            voice: Voice model to use for synthesis. Defaults to "aura-2-helena-en".
+            aiohttp_session: Shared aiohttp session for HTTP requests with connection pooling.
+            base_url: Custom base URL for Deepgram API. Defaults to "https://api.deepgram.com".
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            encoding: Audio encoding format. Defaults to "linear16".
+            **kwargs: Additional arguments passed to parent TTSService class.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._api_key = api_key
+        self._session = aiohttp_session
+        self._base_url = base_url
+        self._settings = {
+            "encoding": encoding,
+        }
+        self.set_voice(voice)
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate metrics.
+
+        Returns:
+            True, as Deepgram TTS service supports metrics generation.
+        """
+        return True
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Deepgram's TTS API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech, plus start/stop frames.
+        """
+        logger.debug(f"{self}: Generating TTS [{text}]")
+
+        # Build URL with parameters
+        url = f"{self._base_url}/v1/speak"
+
+        headers = {"Authorization": f"Token {self._api_key}", "Content-Type": "application/json"}
+
+        params = {
+            "model": self._voice_id,
+            "encoding": self._settings["encoding"],
+            "sample_rate": self.sample_rate,
+            "container": "none",
+        }
+
+        payload = {
+            "text": text,
+        }
+
+        try:
+            await self.start_ttfb_metrics()
+
+            async with self._session.post(
+                url, headers=headers, json=payload, params=params
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"HTTP {response.status}: {error_text}")
+
+                await self.start_tts_usage_metrics(text)
+                yield TTSStartedFrame()
+
+                CHUNK_SIZE = self.chunk_size
+
+                first_chunk = True
+                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
+                    if first_chunk:
+                        await self.stop_ttfb_metrics()
+                        first_chunk = False
+
+                    if chunk:
+                        yield TTSAudioRawFrame(
+                            audio=chunk,
+                            sample_rate=self.sample_rate,
+                            num_channels=1,
+                        )
+
+                yield TTSStoppedFrame()
+
+        except Exception as e:
+            logger.exception(f"{self} exception: {e}")
+            yield ErrorFrame(f"Error getting audio: {str(e)}")
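For quick credential or latency checks outside a Pipecat pipeline, the request run_tts() issues can be reproduced with plain aiohttp. A minimal sketch mirroring the URL, auth header, query parameters, and JSON payload above; the voice and the 24000 Hz sample rate are illustrative choices, not values fixed by this commit:

import asyncio
import os

import aiohttp


async def main():
    headers = {
        "Authorization": f"Token {os.getenv('DEEPGRAM_API_KEY')}",
        "Content-Type": "application/json",
    }
    # Same query parameters run_tts() builds; "container": "none" returns raw
    # audio with no WAV header.
    params = {
        "model": "aura-2-helena-en",  # any Aura voice id
        "encoding": "linear16",
        "sample_rate": 24000,  # illustrative; pick to match your pipeline
        "container": "none",
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.deepgram.com/v1/speak",
            headers=headers,
            params=params,
            json={"text": "Hello from Deepgram."},
        ) as response:
            response.raise_for_status()
            # Stream raw 16-bit PCM to disk, chunk by chunk.
            with open("speech.pcm", "wb") as f:
                async for chunk in response.content.iter_chunked(8192):
                    f.write(chunk)


asyncio.run(main())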
