
Commit 8461208

Refactor DeepgramTTSService to use HTTP directly

1 parent 5db0871 · commit 8461208

3 files changed: +108 additions, -77 deletions

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -151,6 +151,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Refactor `DeepgramTTSService` to use a direct HTTP connection. This results
+  in a significant TTFB reduction when compared to using the Deepgram Python
+  SDK.
+
+  Note: an `aiohttp_session` is now required when initializing
+  `DeepgramTTSService`.
+
 - `DailyTransport` triggers `on_error` event if transcription can't be started
   or stopped.
 
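The net effect for callers: the service must now be constructed while a shared `aiohttp.ClientSession` is open. A minimal sketch of the new initialization, assuming only the constructor signature from the src/pipecat/services/deepgram/tts.py diff below; the `main()` wrapper is illustrative scaffolding, not part of this commit:

import os

import aiohttp

from pipecat.services.deepgram.tts import DeepgramTTSService


async def main():
    # The service no longer owns its HTTP client; callers supply a shared
    # aiohttp session, which enables connection pooling across requests.
    async with aiohttp.ClientSession() as session:
        tts = DeepgramTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-2-helena-en",  # default voice per the new signature
            aiohttp_session=session,   # now required
        )
        ...  # build and run the pipeline while the session is still open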

examples/foundational/07c-interruptible-deepgram.py

Lines changed: 54 additions & 48 deletions
@@ -7,6 +7,7 @@
 
 import os
 
+import aiohttp
 from dotenv import load_dotenv
 from loguru import logger
 
@@ -60,58 +61,63 @@
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     logger.info(f"Starting bot")
 
-    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+    async with aiohttp.ClientSession() as session:
+        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
 
-    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
+        tts = DeepgramTTSService(
+            api_key=os.getenv("DEEPGRAM_API_KEY"),
+            voice="aura-2-andromeda-en",
+            aiohttp_session=session,
+        )
 
-    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
 
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
-        },
-    ]
-
-    context = LLMContext(messages)
-    context_aggregator = LLMContextAggregatorPair(context)
-
-    pipeline = Pipeline(
-        [
-            transport.input(),  # Transport user input
-            stt,  # STT
-            context_aggregator.user(),  # User responses
-            llm,  # LLM
-            tts,  # TTS
-            transport.output(),  # Transport bot output
-            context_aggregator.assistant(),  # Assistant spoken responses
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
         ]
-    )
-
-    task = PipelineTask(
-        pipeline,
-        params=PipelineParams(
-            enable_metrics=True,
-            enable_usage_metrics=True,
-        ),
-        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
-    )
-
-    @transport.event_handler("on_client_connected")
-    async def on_client_connected(transport, client):
-        logger.info(f"Client connected")
-        # Kick off the conversation.
-        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
-        await task.queue_frames([LLMRunFrame()])
-
-    @transport.event_handler("on_client_disconnected")
-    async def on_client_disconnected(transport, client):
-        logger.info(f"Client disconnected")
-        await task.cancel()
-
-    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
-
-    await runner.run(task)
+
+        context = LLMContext(messages)
+        context_aggregator = LLMContextAggregatorPair(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                stt,  # STT
+                context_aggregator.user(),  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                context_aggregator.assistant(),  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            params=PipelineParams(
+                enable_metrics=True,
+                enable_usage_metrics=True,
+            ),
+            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
+        )
+
+        @transport.event_handler("on_client_connected")
+        async def on_client_connected(transport, client):
+            logger.info(f"Client connected")
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMRunFrame()])
+
+        @transport.event_handler("on_client_disconnected")
+        async def on_client_disconnected(transport, client):
+            logger.info(f"Client disconnected")
+            await task.cancel()
+
+        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
+
+        await runner.run(task)
 
 
 async def bot(runner_args: RunnerArguments):

src/pipecat/services/deepgram/tts.py

Lines changed: 47 additions & 29 deletions
@@ -12,6 +12,7 @@
 
 from typing import AsyncGenerator, Optional
 
+import aiohttp
 from loguru import logger
 
 from pipecat.frames.frames import (
@@ -24,13 +25,6 @@
 from pipecat.services.tts_service import TTSService
 from pipecat.utils.tracing.service_decorators import traced_tts
 
-try:
-    from deepgram import DeepgramClient, DeepgramClientOptions, SpeakOptions
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error("In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`.")
-    raise Exception(f"Missing module: {e}")
-
 
 class DeepgramTTSService(TTSService):
     """Deepgram text-to-speech service.
@@ -45,7 +39,8 @@ def __init__(
         *,
         api_key: str,
         voice: str = "aura-2-helena-en",
-        base_url: str = "",
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.deepgram.com",
         sample_rate: Optional[int] = None,
         encoding: str = "linear16",
         **kwargs,
@@ -55,21 +50,22 @@
         Args:
             api_key: Deepgram API key for authentication.
             voice: Voice model to use for synthesis. Defaults to "aura-2-helena-en".
-            base_url: Custom base URL for Deepgram API. Uses default if empty.
+            aiohttp_session: Shared aiohttp session for HTTP requests with connection pooling.
+            base_url: Custom base URL for Deepgram API. Defaults to "https://api.deepgram.com".
             sample_rate: Audio sample rate in Hz. If None, uses service default.
             encoding: Audio encoding format. Defaults to "linear16".
             **kwargs: Additional arguments passed to parent TTSService class.
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
+        self._api_key = api_key
+        self._session = aiohttp_session
+        self._base_url = base_url
         self._settings = {
             "encoding": encoding,
         }
         self.set_voice(voice)
 
-        client_options = DeepgramClientOptions(url=base_url)
-        self._deepgram_client = DeepgramClient(api_key, config=client_options)
-
     def can_generate_metrics(self) -> bool:
         """Check if the service can generate metrics.
 
@@ -90,27 +86,49 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         """
         logger.debug(f"{self}: Generating TTS [{text}]")
 
-        options = SpeakOptions(
-            model=self._voice_id,
-            encoding=self._settings["encoding"],
-            sample_rate=self.sample_rate,
-            container="none",
-        )
+        # Build URL with parameters
+        url = f"{self._base_url}/v1/speak"
 
-        try:
-            await self.start_ttfb_metrics()
+        headers = {"Authorization": f"Token {self._api_key}", "Content-Type": "application/json"}
 
-            response = await self._deepgram_client.speak.asyncrest.v("1").stream_raw(
-                {"text": text}, options
-            )
+        params = {
+            "model": self._voice_id,
+            "encoding": self._settings["encoding"],
+            "sample_rate": self.sample_rate,
+            "container": "none",
+        }
 
-            await self.start_tts_usage_metrics(text)
-            yield TTSStartedFrame()
+        payload = {
+            "text": text,
+        }
+
+        try:
+            await self.start_ttfb_metrics()
 
-            async for data in response.aiter_bytes():
-                await self.stop_ttfb_metrics()
-                if data:
-                    yield TTSAudioRawFrame(audio=data, sample_rate=self.sample_rate, num_channels=1)
+            async with self._session.post(
+                url, headers=headers, json=payload, params=params
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"HTTP {response.status}: {error_text}")
+
+                await self.start_tts_usage_metrics(text)
+                yield TTSStartedFrame()
+
+                CHUNK_SIZE = self.chunk_size
+
+                first_chunk = True
+                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
+                    if first_chunk:
+                        await self.stop_ttfb_metrics()
+                        first_chunk = False
+
+                    if chunk:
+                        yield TTSAudioRawFrame(
+                            audio=chunk,
+                            sample_rate=self.sample_rate,
+                            num_channels=1,
+                        )
 
             yield TTSStoppedFrame()
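Outside the service class, the refactored `run_tts()` boils down to one streaming POST. This standalone sketch mirrors the endpoint, headers, query parameters, and chunked-read pattern from the diff above; the model name, sample rate, and 8192-byte chunk size are illustrative stand-ins for the values the service takes from its settings:

import asyncio
import os

import aiohttp


async def speak(text: str) -> bytes:
    """Fetch raw PCM audio for `text` from Deepgram's speak endpoint."""
    url = "https://api.deepgram.com/v1/speak"
    headers = {
        "Authorization": f"Token {os.getenv('DEEPGRAM_API_KEY')}",
        "Content-Type": "application/json",
    }
    params = {
        "model": "aura-2-helena-en",
        "encoding": "linear16",
        "sample_rate": 24000,
        "container": "none",
    }
    audio = bytearray()
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json={"text": text}, params=params) as resp:
            if resp.status != 200:
                raise Exception(f"HTTP {resp.status}: {await resp.text()}")
            # Audio bytes stream back as they are synthesized; the first
            # chunk is what the service's TTFB metric measures.
            async for chunk in resp.content.iter_chunked(8192):
                audio.extend(chunk)
    return bytes(audio)


if __name__ == "__main__":
    asyncio.run(speak("Hello there!"))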
