
Commit 7eb880c

Add DeepgramHttpTTSService
1 parent: 4fa0de6

4 files changed: +248 -0 lines changed


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added a new `DeepgramHttpTTSService`, which delivers a meaningful reduction
+  in latency when compared to the `DeepgramTTSService`.
+
 - Add support for `speaking_rate` input parameter in `GoogleHttpTTSService`.
 
 - Added `enable_speaker_diarization` and `enable_language_identification` to
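In short, the new service needs only an API key and a caller-owned aiohttp session; everything else has defaults. A minimal construction sketch based on the constructor added in src/pipecat/services/deepgram/tts.py below (the getenv-based key lookup is illustrative, not part of this commit); the full foundational example follows:

import asyncio
import os

import aiohttp

from pipecat.services.deepgram.tts import DeepgramHttpTTSService


async def main():
    # The service borrows a caller-owned session so its HTTP requests can
    # reuse pooled connections; the caller is responsible for closing it.
    async with aiohttp.ClientSession() as session:
        tts = DeepgramHttpTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),  # illustrative key lookup
            aiohttp_session=session,
            # Optional, with the committed defaults:
            #   voice="aura-2-helena-en", encoding="linear16",
            #   base_url="https://api.deepgram.com", sample_rate=None
        )
        print(tts)  # ready to drop into a Pipeline as the TTS stage


asyncio.run(main())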
examples/foundational/07c-interruptible-deepgram-http.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#


import os

import aiohttp
from dotenv import load_dotenv
from loguru import logger

from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.tts import DeepgramHttpTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

load_dotenv(override=True)


# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    ),
}


async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = DeepgramHttpTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-2-andromeda-en",
            aiohttp_session=session,
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = LLMContext(messages)
        context_aggregator = LLMContextAggregatorPair(context)

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info(f"Client connected")
            # Kick off the conversation.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info(f"Client disconnected")
            await task.cancel()

        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

        await runner.run(task)


async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud."""
    transport = await create_transport(runner_args, transport_params)
    await run_bot(transport, runner_args)


if __name__ == "__main__":
    from pipecat.runner.run import main

    main()
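Two details of the example are worth noting. The transport_params dict holds factories rather than instances, so only the selected transport's VAD and turn analyzers ever get built. And the single aiohttp.ClientSession opened for the lifetime of run_bot() is the one handed to DeepgramHttpTTSService, so each per-utterance TTS request can reuse pooled connections instead of opening a fresh one.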

scripts/evals/run-release-evals.py

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
     ("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
     ("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
     ("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
+    ("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
     ("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
     ("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
     ("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),

src/pipecat/services/deepgram/tts.py

Lines changed: 112 additions & 0 deletions
@@ -12,6 +12,7 @@
 
 from typing import AsyncGenerator, Optional
 
+import aiohttp
 from loguru import logger
 
 from pipecat.frames.frames import (
@@ -117,3 +118,114 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         except Exception as e:
             logger.exception(f"{self} exception: {e}")
             yield ErrorFrame(f"Error getting audio: {str(e)}")
+
+
+class DeepgramHttpTTSService(TTSService):
+    """Deepgram HTTP text-to-speech service.
+
+    Provides text-to-speech synthesis using Deepgram's HTTP TTS API.
+    Supports various voice models and audio encoding formats with
+    configurable sample rates and quality settings.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        voice: str = "aura-2-helena-en",
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.deepgram.com",
+        sample_rate: Optional[int] = None,
+        encoding: str = "linear16",
+        **kwargs,
+    ):
+        """Initialize the Deepgram TTS service.
+
+        Args:
+            api_key: Deepgram API key for authentication.
+            voice: Voice model to use for synthesis. Defaults to "aura-2-helena-en".
+            aiohttp_session: Shared aiohttp session for HTTP requests with connection pooling.
+            base_url: Custom base URL for Deepgram API. Defaults to "https://api.deepgram.com".
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            encoding: Audio encoding format. Defaults to "linear16".
+            **kwargs: Additional arguments passed to parent TTSService class.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._api_key = api_key
+        self._session = aiohttp_session
+        self._base_url = base_url
+        self._settings = {
+            "encoding": encoding,
+        }
+        self.set_voice(voice)
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate metrics.
+
+        Returns:
+            True, as Deepgram TTS service supports metrics generation.
+        """
+        return True
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Deepgram's TTS API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech, plus start/stop frames.
+        """
+        logger.debug(f"{self}: Generating TTS [{text}]")
+
+        # Build URL with parameters
+        url = f"{self._base_url}/v1/speak"
+
+        headers = {"Authorization": f"Token {self._api_key}", "Content-Type": "application/json"}
+
+        params = {
+            "model": self._voice_id,
+            "encoding": self._settings["encoding"],
+            "sample_rate": self.sample_rate,
+            "container": "none",
+        }
+
+        payload = {
+            "text": text,
+        }
+
+        try:
+            await self.start_ttfb_metrics()
+
+            async with self._session.post(
+                url, headers=headers, json=payload, params=params
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"HTTP {response.status}: {error_text}")
+
+                await self.start_tts_usage_metrics(text)
+                yield TTSStartedFrame()
+
+                CHUNK_SIZE = self.chunk_size
+
+                first_chunk = True
+                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
+                    if first_chunk:
+                        await self.stop_ttfb_metrics()
+                        first_chunk = False
+
+                    if chunk:
+                        yield TTSAudioRawFrame(
+                            audio=chunk,
+                            sample_rate=self.sample_rate,
+                            num_channels=1,
+                        )
+
+                yield TTSStoppedFrame()
+
+        except Exception as e:
+            logger.exception(f"{self} exception: {e}")
+            yield ErrorFrame(f"Error getting audio: {str(e)}")
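For quick credential or latency checks outside a Pipecat pipeline, the request run_tts() issues can be reproduced with plain aiohttp. A minimal sketch mirroring the URL, auth header, query parameters, and JSON payload above; the voice and the 24000 Hz sample rate are illustrative choices, not values fixed by this commit:

import asyncio
import os

import aiohttp


async def main():
    headers = {
        "Authorization": f"Token {os.getenv('DEEPGRAM_API_KEY')}",
        "Content-Type": "application/json",
    }
    # Same query parameters run_tts() builds; "container": "none" returns raw
    # audio with no WAV header.
    params = {
        "model": "aura-2-helena-en",  # any Aura voice id
        "encoding": "linear16",
        "sample_rate": 24000,  # illustrative; pick to match your pipeline
        "container": "none",
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.deepgram.com/v1/speak",
            headers=headers,
            params=params,
            json={"text": "Hello from Deepgram."},
        ) as response:
            response.raise_for_status()
            # Stream raw 16-bit PCM to disk, chunk by chunk.
            with open("speech.pcm", "wb") as f:
                async for chunk in response.content.iter_chunked(8192):
                    f.write(chunk)


asyncio.run(main())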
