From 60b22b3ae4595feae025f5650f33ccea7c0f7199 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Wed, 5 Nov 2025 18:21:46 +0800
Subject: [PATCH 01/14] support volcano TTS tools

---
 .gitignore                            |   5 +-
 tests/tools/builtin_tools/test_tts.py | 127 +++++++++++++++
 veadk/tools/builtin_tools/tts.py      | 221 ++++++++++++++++++++++++++
 veadk/utils/audio_manager.py          |  54 +++++++
 4 files changed, 406 insertions(+), 1 deletion(-)
 create mode 100644 tests/tools/builtin_tools/test_tts.py
 create mode 100644 veadk/tools/builtin_tools/tts.py
 create mode 100644 veadk/utils/audio_manager.py

diff --git a/.gitignore b/.gitignore
index 8efbddc4..70d099ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -196,4 +196,7 @@ cython_debug/
 
 **/.nuxt
 **/.data
-**./output
\ No newline at end of file
+**./output
+
+*.mp3
+*.pcm
\ No newline at end of file
diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
new file mode 100644
index 00000000..0922dea3
--- /dev/null
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import queue
+import json
+import base64
+import requests
+from unittest import TestCase, mock
+from unittest.mock import patch, MagicMock
+from google.adk.tools import ToolContext
+from veadk.tools.builtin_tools.tts import tts, handle_server_response, save_output_to_file, _audio_player_thread
+
+
+class TestTTS(TestCase):
+    def setUp(self):
+        self.mock_tool_context = MagicMock(spec=ToolContext)
+        self.mock_tool_context._invocation_context = MagicMock()
+        self.mock_tool_context._invocation_context.user_id = "test_user"
+
+        # Mock environment variables
+        self.patcher_env = patch.dict('os.environ', {
+            'TOOL_TTS_APP_ID': 'test_app_id',
+            'TOOL_TTS_API_KEY': 'test_api_key'
+        })
+        self.patcher_env.start()
+
+    def tearDown(self):
+        self.patcher_env.stop()
+
+    @patch('requests.Session')
+    def test_tts_success(self, mock_session):
+        """Test successful TTS request"""
+        # Setup mock response
+        mock_response = MagicMock()
+        mock_response.headers = {'X-Tt-Logid': 'test_log_id'}
+        mock_response.iter_lines.return_value = [
+            json.dumps({"code": 0, "data": base64.b64encode(b"audio_chunk").decode()}),
+            json.dumps({"code": 20000000})
+        ]
+        mock_session.return_value.post.return_value = mock_response
+
+        # Call function
+        result = tts("test text", self.mock_tool_context)
+
+        # Assertions
+        self.assertTrue(result)
+        mock_session.return_value.post.assert_called_once()
+        mock_response.close.assert_called_once()
+
+    @patch('requests.Session')
+    def test_tts_failure(self, mock_session):
+        """Test TTS request failure"""
+        # Setup mock to raise exception
+        mock_session.return_value.post.side_effect = requests.exceptions.RequestException("Test error")
+
+        # Call function
+        result = tts("test text", self.mock_tool_context)
+
+        # Assertions
+        self.assertFalse(result)  # Still returns True despite error
+        mock_session.return_value.post.assert_called_once()
+
+    @patch('builtins.open')
+    @patch('pyaudio.PyAudio')
+    def test_handle_server_response_success(self, mock_pyaudio, mock_open):
+        """Test successful response handling"""
+        # Setup mock response
+        mock_response = MagicMock()
+        mock_response.iter_lines.return_value = [
+            json.dumps({"code": 0, "data": base64.b64encode(b"audio_chunk").decode()}),
+            json.dumps({"code": 20000000})
+        ]
+
+        # Setup mock audio stream
+        mock_stream = MagicMock()
+        mock_pyaudio.return_value.open.return_value = mock_stream
+
+        # Call function
+        handle_server_response(mock_response, "test.pcm")
+
+        # Assertions
+        mock_stream.write.assert_called_with(b"audio_chunk")
+        mock_open.assert_called_once_with("test.pcm", 'wb')
+
+    @patch('builtins.open')
+    def test_save_output_to_file_success(self, mock_open):
+        """Test successful audio file save"""
+        # Setup mock file handler
+        mock_file = MagicMock()
+        mock_open.return_value.__enter__.return_value = mock_file
+
+        # Call function
+        save_output_to_file(b"audio_data", "test.pcm")
+
+        # Assertions
+        mock_open.assert_called_once_with("test.pcm", 'wb')
+        mock_file.write.assert_called_once_with(b"audio_data")
+
+
+    @patch('time.sleep')
+    def test_audio_player_thread(self, mock_sleep):
+        """Test audio player thread"""
+        # Setup test data
+        mock_queue = MagicMock()
+        mock_queue.get.side_effect = [b"audio_data", queue.Empty]
+        mock_stream = MagicMock()
+        stop_event = MagicMock()
+        stop_event.is_set.side_effect = [False, True]
+
+        # Call function
+        _audio_player_thread(mock_queue, mock_stream, stop_event)
+
+        # Assertions
+        mock_stream.write.assert_called_once_with(b"audio_data")
+        mock_queue.task_done.assert_called_once()
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
new file mode 100644
index 00000000..ef677ef3
--- /dev/null
+++ b/veadk/tools/builtin_tools/tts.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import json
+import base64
+import time
+import queue
+import pyaudio
+import threading
+from google.adk.tools import ToolContext
+from veadk.config import getenv, settings
+from veadk.utils.logger import get_logger
+from veadk.utils.audio_manager import AudioDeviceManager, AudioConfig
+
+logger = get_logger(__name__)
+
+input_audio_config = {
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 16000,
+    "bit_size": pyaudio.paInt16
+}
+
+output_audio_config = {
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 24000,
+    "bit_size": pyaudio.paInt16
+}
+
+
+def tts(text: str, tool_context: ToolContext) -> bool:
+    """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
+        Use this tool when you need to convert text content into audible speech.
+        It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
+        selection (e.g., male/female/neutral), speech speed and volume adjustment, as well as exporting the generated
+        audio in common formats (e.g., MP3, WAV).
+
+        Args:
+            text: The text to convert.
+
+        Returns:
+            True if the TTS conversion is successful, False otherwise.
+        """
+    url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
+    audio_save_path = "tts.pcm"
+    success = True
+
+    app_id = getenv("TOOL_TTS_APP_ID")
+    api_key = getenv("TOOL_TTS_API_KEY")
+    speaker = getenv("TOOL_TTS_SPEAKER")  # e.g. zh_female_vv_mars_bigtts
+    headers = {
+        "X-Api-App-Id": app_id,
+        "X-Api-Access-Key": api_key,
+        "X-Api-Resource-Id": "seed-tts-1.0",  # seed-tts-1.0 or seed-tts-2.0
+        "Content-Type": "application/json",
+        "Connection": "keep-alive"
+    }
+    payload = {
+        "user": {
+            "uid": tool_context._invocation_context.user_id
+        },
+        "req_params": {
+            "text": text,
+            "speaker": speaker,
+            "audio_params": {
+                "format": "pcm",
+                "bit_rate": 16000,
+                "sample_rate": 24000,
+                "enable_timestamp": True
+            },
+            "additions": "{\"explicit_language\":\"zh\",\"disable_markdown_filter\":true, \"enable_timestamp\":true}\"}"
+        }
+    }
+
+    session = requests.Session()
+    response = None
+
+    try:
+        logger.debug(f"Request TTS server with payload: {payload}.")
+        response = session.post(url, headers=headers, json=payload, stream=True)
+        log_id = response.headers.get('X-Tt-Logid')
+        logger.debug(f"Response from TTS server with logid: {log_id}, and response body {response}")
+        handle_server_response(response, audio_save_path)
+
+    except Exception as e:
+        logger.debug(f"Failed to convert text to speech: {e}")
+        success = False
+    finally:
+        if response:
+            response.close()
+        session.close()
+    return success
+
+
+def handle_server_response(response: requests.models.Response, audio_save_path: str) -> None:
+    """
+    Handle the server response for TTS.
+
+    Args:
+        response: The server response as a dictionary.
+
+    Returns:
+        None
+    """
+
+    # audio data buffer
+    audio_data = bytearray()
+    # audio data queue for player thread
+    audio_queue = queue.Queue()
+    total_audio_size = 0
+
+    audio_device = AudioDeviceManager(
+        AudioConfig(**input_audio_config),
+        AudioConfig(**output_audio_config)
+    )
+
+    # init output stream
+    output_stream = audio_device.open_output_stream()
+    stop_event = threading.Event()
+    player_thread = threading.Thread(target=_audio_player_thread, args=(audio_queue, output_stream, stop_event))
+    player_thread.daemon = True
+    player_thread.start()
+
+    try:
+        for chunk in response.iter_lines(decode_unicode=True):
+            if not chunk:
+                continue
+            data = json.loads(chunk)
+
+            if data.get("code", 0) == 0 and "data" in data and data["data"]:
+                chunk_audio = base64.b64decode(data["data"])
+                audio_size = len(chunk_audio)
+                total_audio_size += audio_size
+                audio_queue.put(chunk_audio)
+                audio_data.extend(chunk_audio)
+                continue
+            if data.get("code", 0) == 0 and "sentence" in data and data["sentence"]:
+                logger.debug(f"sentence_data: {data}")
+                continue
+            if data.get("code", 0) == 20000000:
+                logger.debug(f"successfully get audio data, total size: {total_audio_size / 1024:.2f} KB")
+                break
+            if data.get("code", 0) > 0:
+                logger.debug(f"error response:{data}")
+                break
+
+        # save audio data to file
+        save_output_to_file(audio_data, audio_save_path)
+    except Exception as e:
+        logger.error(f"handle tts failed: {e}, response: {response}")
+    finally:
+        audio_queue.join()
+        stop_event.set()
+        player_thread.join()
+        output_stream.close()
+
+
+def _audio_player_thread(audio_queue, output_stream, stop_event):
+    """
+    Play audio data from queue.
+    Args:
+        audio_queue: The queue to store audio data.
+        output_stream: The output stream to play audio.
+        stop_event: The event to stop the thread.
+
+    Returns:
+
+    """
+    while not stop_event.is_set():
+        try:
+            # write audio data to output stream
+            audio_data = audio_queue.get(timeout=1.0)
+            if audio_data:
+                output_stream.write(audio_data)
+            audio_queue.task_done()
+        except queue.Empty:
+            # if queue is empty, sleep for a while
+            time.sleep(0.1)
+        except Exception as e:
+            logger.debug(f"Failed to play audio data: {e}")
+            time.sleep(0.1)
+    logger.debug("audio player thread exited")
+
+
+def save_output_to_file(audio_data: bytearray, filename: str):
+    """
+    Save audio data to file.
+
+    Args:
+        audio_data: The audio data as bytes.
+        filename: The filename to save the audio data.
+
+    Returns:
+        None
+    """
+
+    if not audio_data:
+        logger.debug("No audio data to save.")
+        return
+    try:
+        with open(filename, 'wb') as f:
+            f.write(audio_data)
+            logger.debug(
+                f"Successfully save audio file to {filename},file size: {len(audio_data) / 1024:.2f} KB")
+    except IOError as e:
+        logger.debug(f"Failed to save pcm file: {e}")
diff --git a/veadk/utils/audio_manager.py b/veadk/utils/audio_manager.py
new file mode 100644
index 00000000..6f596b97
--- /dev/null
+++ b/veadk/utils/audio_manager.py
@@ -0,0 +1,54 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+
+import pyaudio
+
+
+@dataclass
+class AudioConfig:
+    """audio config"""
+    format: str
+    bit_size: int
+    channels: int
+    sample_rate: int
+    chunk: int
+
+
+class AudioDeviceManager:
+    """audio device manager, handle audio input/output"""
+
+    def __init__(self, input_config: AudioConfig, output_config: AudioConfig):
+        self.input_config = input_config
+        self.output_config = output_config
+        self.pyaudio = pyaudio.PyAudio()
+        self.input_stream: Optional[pyaudio.Stream] = None
+        self.output_stream: Optional[pyaudio.Stream] = None
+
+    def open_input_stream(self) -> pyaudio.Stream:
+        # p = pyaudio.PyAudio()
+        self.input_stream = self.pyaudio.open(
+            format=self.input_config.bit_size,
+            channels=self.input_config.channels,
+            rate=self.input_config.sample_rate,
+            input=True,
+            frames_per_buffer=self.input_config.chunk
+        )
+        return self.input_stream
+
+    def open_output_stream(self) -> pyaudio.Stream:
+        self.output_stream = self.pyaudio.open(
+            format=self.output_config.bit_size,
+            channels=self.output_config.channels,
+            rate=self.output_config.sample_rate,
+            output=True,
+            frames_per_buffer=self.output_config.chunk
+        )
+        return self.output_stream
+
+    def cleanup(self) -> None:
+        for stream in [self.input_stream, self.output_stream]:
+            if stream:
+                stream.stop_stream()
+                stream.close()
+        self.pyaudio.terminate()
+

From 1314c06356a0c83572c8fa987e0dad8e529a8052 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Thu, 6 Nov 2025 15:33:25 +0800
Subject: [PATCH 02/14] update pyaudio dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 01d51cdc..e97f8dd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
     "pymysql>=1.1.1", # For MySQL database (short term memory)
     "opensearch-py==2.8.0",
     "filetype>=1.2.0",
+    "pyaudio>=0.2.14",
 ]
 
 [project.scripts]

From fe59d1a0aeff5e6c002d1158f76670d6b99dd419 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Thu, 6 Nov 2025 16:01:51 +0800
Subject: [PATCH 03/14] code lint; save TTS audio to a temp file

---
 tests/tools/builtin_tools/test_tts.py | 49 ++++++++++--------
 veadk/tools/builtin_tools/tts.py      | 74 ++++++++++++++++-----------
 2 files changed, 74 insertions(+), 49 deletions(-)

diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
index 0922dea3..c8b9066b 100644
--- a/tests/tools/builtin_tools/test_tts.py
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -12,15 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import queue
 import json
 import base64
 import requests
-from unittest import TestCase, mock
+from unittest import TestCase
 from unittest.mock import patch, MagicMock
 from google.adk.tools import ToolContext
-from veadk.tools.builtin_tools.tts import tts, handle_server_response, save_output_to_file, _audio_player_thread
+from veadk.tools.builtin_tools.tts import (
+    tts,
+    handle_server_response,
+    save_output_to_file,
+    _audio_player_thread,
+)
 
 
 class TestTTS(TestCase):
@@ -30,24 +34,28 @@ def setUp(self):
         self.mock_tool_context._invocation_context.user_id = "test_user"
 
         # Mock environment variables
-        self.patcher_env = patch.dict('os.environ', {
-            'TOOL_TTS_APP_ID': 'test_app_id',
-            'TOOL_TTS_API_KEY': 'test_api_key'
-        })
+        self.patcher_env = patch.dict(
+            "os.environ",
+            {
+                "TOOL_TTS_APP_ID": "test_app_id",
+                "TOOL_TTS_API_KEY": "test_api_key",
+                "TOOL_TTS_SPEAKER": "test_speaker",
+            },
+        )
         self.patcher_env.start()
 
     def tearDown(self):
         self.patcher_env.stop()
 
-    @patch('requests.Session')
+    @patch("requests.Session")
     def test_tts_success(self, mock_session):
         """Test successful TTS request"""
         # Setup mock response
         mock_response = MagicMock()
-        mock_response.headers = {'X-Tt-Logid': 'test_log_id'}
+        mock_response.headers = {"X-Tt-Logid": "test_log_id"}
         mock_response.iter_lines.return_value = [
             json.dumps({"code": 0, "data": base64.b64encode(b"audio_chunk").decode()}),
-            json.dumps({"code": 20000000})
+            json.dumps({"code": 20000000}),
         ]
         mock_session.return_value.post.return_value = mock_response
 
@@ -59,11 +67,13 @@ def test_tts_success(self, mock_session):
         mock_session.return_value.post.assert_called_once()
         mock_response.close.assert_called_once()
 
-    @patch('requests.Session')
+    @patch("requests.Session")
     def test_tts_failure(self, mock_session):
         """Test TTS request failure"""
         # Setup mock to raise exception
-        mock_session.return_value.post.side_effect = requests.exceptions.RequestException("Test error")
+        mock_session.return_value.post.side_effect = (
+            requests.exceptions.RequestException("Test error")
+        )
 
         # Call function
         result = tts("test text", self.mock_tool_context)
@@ -72,15 +82,15 @@ def test_tts_failure(self, mock_session):
         self.assertFalse(result)  # Still returns True despite error
         mock_session.return_value.post.assert_called_once()
 
-    @patch('builtins.open')
-    @patch('pyaudio.PyAudio')
+    @patch("builtins.open")
+    @patch("pyaudio.PyAudio")
     def test_handle_server_response_success(self, mock_pyaudio, mock_open):
         """Test successful response handling"""
         # Setup mock response
         mock_response = MagicMock()
         mock_response.iter_lines.return_value = [
             json.dumps({"code": 0, "data": base64.b64encode(b"audio_chunk").decode()}),
-            json.dumps({"code": 20000000})
+            json.dumps({"code": 20000000}),
         ]
 
         # Setup mock audio stream
@@ -92,9 +102,9 @@ def test_handle_server_response_success(self, mock_pyaudio, mock_open):
 
         # Assertions
         mock_stream.write.assert_called_with(b"audio_chunk")
-        mock_open.assert_called_once_with("test.pcm", 'wb')
+        mock_open.assert_called_once_with("test.pcm", "wb")
 
-    @patch('builtins.open')
+    @patch("builtins.open")
     def test_save_output_to_file_success(self, mock_open):
         """Test successful audio file save"""
         # Setup mock file handler
@@ -105,11 +115,10 @@ def test_save_output_to_file_success(self, mock_open):
         save_output_to_file(b"audio_data", "test.pcm")
 
         # Assertions
-        mock_open.assert_called_once_with("test.pcm", 'wb')
+        mock_open.assert_called_once_with("test.pcm", "wb")
         mock_file.write.assert_called_once_with(b"audio_data")
 
-
-    @patch('time.sleep')
+    @patch("time.sleep")
     def test_audio_player_thread(self, mock_sleep):
         """Test audio player thread"""
         # Setup test data
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index ef677ef3..82a8870c 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -19,8 +19,9 @@
 import queue
 import pyaudio
 import threading
+import tempfile
 from google.adk.tools import ToolContext
-from veadk.config import getenv, settings
+from veadk.config import getenv
 from veadk.utils.logger import get_logger
 from veadk.utils.audio_manager import AudioDeviceManager, AudioConfig
 
@@ -31,7 +32,7 @@
     "format": "pcm",
     "channels": 1,
     "sample_rate": 16000,
-    "bit_size": pyaudio.paInt16
+    "bit_size": pyaudio.paInt16,
 }
 
 output_audio_config = {
@@ -39,25 +40,25 @@
     "format": "pcm",
     "channels": 1,
     "sample_rate": 24000,
-    "bit_size": pyaudio.paInt16
+    "bit_size": pyaudio.paInt16,
 }
 
 
 def tts(text: str, tool_context: ToolContext) -> bool:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
-        Use this tool when you need to convert text content into audible speech.
-        It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
-        selection (e.g., male/female/neutral), speech speed and volume adjustment, as well as exporting the generated
-        audio in common formats (e.g., MP3, WAV).
+    Use this tool when you need to convert text content into audible speech.
+    It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
+    selection (e.g., male/female/neutral), speech speed and volume adjustment, as well as exporting the generated
+    audio in common formats (e.g., MP3, WAV).
 
-        Args:
-            text: The text to convert.
+    Args:
+        text: The text to convert.
 
-        Returns:
-            True if the TTS conversion is successful, False otherwise.
-        """
+    Returns:
+        True if the TTS conversion is successful, False otherwise.
+    """
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
-    audio_save_path = "tts.pcm"
+    audio_save_path = ""
     success = True
 
     app_id = getenv("TOOL_TTS_APP_ID")
@@ -68,12 +69,10 @@ def tts(text: str, tool_context: ToolContext) -> bool:
         "X-Api-Access-Key": api_key,
         "X-Api-Resource-Id": "seed-tts-1.0",  # seed-tts-1.0 or seed-tts-2.0
         "Content-Type": "application/json",
-        "Connection": "keep-alive"
+        "Connection": "keep-alive",
     }
     payload = {
-        "user": {
-            "uid": tool_context._invocation_context.user_id
-        },
+        "user": {"uid": tool_context._invocation_context.user_id},
         "req_params": {
             "text": text,
             "speaker": speaker,
@@ -81,10 +80,10 @@ def tts(text: str, tool_context: ToolContext) -> bool:
                 "format": "pcm",
                 "bit_rate": 16000,
                 "sample_rate": 24000,
-                "enable_timestamp": True
+                "enable_timestamp": True,
             },
-            "additions": "{\"explicit_language\":\"zh\",\"disable_markdown_filter\":true, \"enable_timestamp\":true}\"}"
-        }
+            "additions": '{"explicit_language":"zh","disable_markdown_filter":true, "enable_timestamp":true}"}',
+        },
     }
 
     session = requests.Session()
@@ -93,8 +92,15 @@ def tts(text: str, tool_context: ToolContext) -> bool:
     try:
         logger.debug(f"Request TTS server with payload: {payload}.")
         response = session.post(url, headers=headers, json=payload, stream=True)
-        log_id = response.headers.get('X-Tt-Logid')
-        logger.debug(f"Response from TTS server with logid: {log_id}, and response body {response}")
+        log_id = response.headers.get("X-Tt-Logid")
+        logger.debug(
+            f"Response from TTS server with logid: {log_id}, and response body {response}"
+        )
+
+        with tempfile.NamedTemporaryFile(
+            suffix=".pcm", delete=False, dir=tempfile.gettempdir()
+        ) as tmp:
+            audio_save_path = tmp.name  # e.g. /tmp/tts_12345.pcm
         handle_server_response(response, audio_save_path)
 
     except Exception as e:
@@ -107,7 +113,9 @@ def tts(text: str, tool_context: ToolContext) -> bool:
     return success
 
 
-def handle_server_response(response: requests.models.Response, audio_save_path: str) -> None:
+def handle_server_response(
+    response: requests.models.Response, audio_save_path: str
+) -> None:
     """
     Handle the server response for TTS.
 
@@ -125,14 +133,15 @@ def handle_server_response(response: requests.models.Response, audio_save_path:
     total_audio_size = 0
 
     audio_device = AudioDeviceManager(
-        AudioConfig(**input_audio_config),
-        AudioConfig(**output_audio_config)
+        AudioConfig(**input_audio_config), AudioConfig(**output_audio_config)
     )
 
     # init output stream
     output_stream = audio_device.open_output_stream()
     stop_event = threading.Event()
-    player_thread = threading.Thread(target=_audio_player_thread, args=(audio_queue, output_stream, stop_event))
+    player_thread = threading.Thread(
+        target=_audio_player_thread, args=(audio_queue, output_stream, stop_event)
+    )
     player_thread.daemon = True
     player_thread.start()
 
@@ -153,7 +162,9 @@ def handle_server_response(response: requests.models.Response, audio_save_path:
                 logger.debug(f"sentence_data: {data}")
                 continue
             if data.get("code", 0) == 20000000:
-                logger.debug(f"successfully get audio data, total size: {total_audio_size / 1024:.2f} KB")
+                logger.debug(
+                    f"successfully get audio data, total size: {total_audio_size / 1024:.2f} KB"
+                )
                 break
             if data.get("code", 0) > 0:
                 logger.debug(f"error response:{data}")
@@ -212,10 +223,15 @@ def save_output_to_file(audio_data: bytearray, filename: str):
     if not audio_data:
         logger.debug("No audio data to save.")
         return
+    if not filename:
+        logger.debug("No filename to save audio data.")
+        return
+
     try:
-        with open(filename, 'wb') as f:
+        with open(filename, "wb") as f:
             f.write(audio_data)
             logger.debug(
-                f"Successfully save audio file to {filename},file size: {len(audio_data) / 1024:.2f} KB")
+                f"Successfully save audio file to {filename},file size: {len(audio_data) / 1024:.2f} KB"
+            )
     except IOError as e:
         logger.debug(f"Failed to save pcm file: {e}")

From b9b3245fff1b9a099dd698824f03697e75e7ad8c Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Thu, 6 Nov 2025 16:40:15 +0800
Subject: [PATCH 04/14] tts: validate env vars, fix additions JSON, log errors

---
 veadk/tools/builtin_tools/tts.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index 82a8870c..a31b385a 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -64,6 +64,11 @@ def tts(text: str, tool_context: ToolContext) -> bool:
     app_id = getenv("TOOL_TTS_APP_ID")
     api_key = getenv("TOOL_TTS_API_KEY")
     speaker = getenv("TOOL_TTS_SPEAKER")  # e.g. zh_female_vv_mars_bigtts
+    if not all([app_id, api_key, speaker]):
+        raise ValueError(
+            "Missing required env vars: TOOL_TTS_APP_ID, TOOL_TTS_API_KEY, TOOL_TTS_SPEAKER"
+        )
+
     headers = {
         "X-Api-App-Id": app_id,
         "X-Api-Access-Key": api_key,
@@ -71,6 +76,11 @@ def tts(text: str, tool_context: ToolContext) -> bool:
         "Content-Type": "application/json",
         "Connection": "keep-alive",
     }
+    additions = {
+        "explicit_language": "zh",
+        "disable_markdown_filter": True,
+        "enable_timestamp": True,
+    }
     payload = {
         "user": {"uid": tool_context._invocation_context.user_id},
         "req_params": {
@@ -82,7 +92,7 @@ def tts(text: str, tool_context: ToolContext) -> bool:
                 "sample_rate": 24000,
                 "enable_timestamp": True,
             },
-            "additions": '{"explicit_language":"zh","disable_markdown_filter":true, "enable_timestamp":true}"}',
+            "additions": json.dumps(additions),
         },
     }
 
@@ -104,7 +114,7 @@ def tts(text: str, tool_context: ToolContext) -> bool:
         handle_server_response(response, audio_save_path)
 
     except Exception as e:
-        logger.debug(f"Failed to convert text to speech: {e}")
+        logger.error(f"Failed to convert text to speech: {e}")
         success = False
     finally:
         if response:
@@ -120,7 +130,7 @@ def handle_server_response(
     Handle the server response for TTS.
 
     Args:
-        response: The server response as a dictionary.
+        response: The server response as a requests.models.Response object.
 
     Returns:
         None
@@ -167,13 +177,14 @@ def handle_server_response(
                 )
                 break
             if data.get("code", 0) > 0:
-                logger.debug(f"error response:{data}")
+                logger.debug(f"tts response error:{data}")
                 break
 
         # save audio data to file
         save_output_to_file(audio_data, audio_save_path)
     except Exception as e:
         logger.error(f"handle tts failed: {e}, response: {response}")
+        raise
     finally:
         audio_queue.join()
         stop_event.set()
@@ -203,12 +214,12 @@ def _audio_player_thread(audio_queue, output_stream, stop_event):
             # if queue is empty, sleep for a while
             time.sleep(0.1)
         except Exception as e:
-            logger.debug(f"Failed to play audio data: {e}")
+            logger.error(f"Failed to play audio data: {e}")
             time.sleep(0.1)
     logger.debug("audio player thread exited")
 
 
-def save_output_to_file(audio_data: bytearray, filename: str):
+def save_output_to_file(audio_data: bytearray, filename: str) -> None:
     """
     Save audio data to file.
 
@@ -234,4 +245,4 @@ def save_output_to_file(audio_data: bytearray, filename: str):
                 f"Successfully save audio file to {filename},file size: {len(audio_data) / 1024:.2f} KB"
             )
     except IOError as e:
-        logger.debug(f"Failed to save pcm file: {e}")
+        logger.error(f"Failed to save pcm file: {e}")

From f8e65c1543818a8c71de3110260487ac601020db Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Thu, 6 Nov 2025 23:44:45 +0800
Subject: [PATCH 05/14] fix(tts): improve functionality and reliability of
 text-to-speech conversion

---
 tests/tools/builtin_tools/test_tts.py | 10 +++++-----
 veadk/tools/builtin_tools/tts.py      | 16 ++++++++++------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
index c8b9066b..61a954e0 100644
--- a/tests/tools/builtin_tools/test_tts.py
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -20,7 +20,7 @@
 from unittest.mock import patch, MagicMock
 from google.adk.tools import ToolContext
 from veadk.tools.builtin_tools.tts import (
-    tts,
+    text_to_speech,
     handle_server_response,
     save_output_to_file,
     _audio_player_thread,
@@ -60,10 +60,10 @@ def test_tts_success(self, mock_session):
         mock_session.return_value.post.return_value = mock_response
 
         # Call function
-        result = tts("test text", self.mock_tool_context)
+        result = text_to_speech("test text", self.mock_tool_context)
 
         # Assertions
-        self.assertTrue(result)
+        self.assertEqual("test text", result)  # Still returns True despite error
         mock_session.return_value.post.assert_called_once()
         mock_response.close.assert_called_once()
 
@@ -76,10 +76,10 @@ def test_tts_failure(self, mock_session):
         )
 
         # Call function
-        result = tts("test text", self.mock_tool_context)
+        result = text_to_speech("test text", self.mock_tool_context)
 
         # Assertions
-        self.assertFalse(result)  # Still returns True despite error
+        self.assertEqual("test text", result)  # Still returns True despite error
         mock_session.return_value.post.assert_called_once()
 
     @patch("builtins.open")
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index a31b385a..d41c58d3 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import requests
 import json
 import base64
@@ -44,7 +45,7 @@
 }
 
 
-def tts(text: str, tool_context: ToolContext) -> bool:
+def text_to_speech(text: str, tool_context: ToolContext) -> str:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
     Use this tool when you need to convert text content into audible speech.
     It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
@@ -55,15 +56,16 @@ def tts(text: str, tool_context: ToolContext) -> bool:
         text: The text to convert.
 
     Returns:
-        True if the TTS conversion is successful, False otherwise.
+        The original text.
     """
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
     audio_save_path = ""
-    success = True
 
     app_id = getenv("TOOL_TTS_APP_ID")
     api_key = getenv("TOOL_TTS_API_KEY")
-    speaker = getenv("TOOL_TTS_SPEAKER")  # e.g. zh_female_vv_mars_bigtts
+    speaker = getenv(
+        "TOOL_TTS_SPEAKER", "zh_female_vv_mars_bigtts"
+    )  # e.g. zh_female_vv_mars_bigtts
     if not all([app_id, api_key, speaker]):
         raise ValueError(
             "Missing required env vars: TOOL_TTS_APP_ID, TOOL_TTS_API_KEY, TOOL_TTS_SPEAKER"
@@ -115,12 +117,14 @@ def tts(text: str, tool_context: ToolContext) -> bool:
 
     except Exception as e:
         logger.error(f"Failed to convert text to speech: {e}")
-        success = False
     finally:
+        if audio_save_path and os.path.exists(audio_save_path):
+            os.remove(audio_save_path)
         if response:
             response.close()
         session.close()
-    return success
+    logger.debug("Finish convert text to speech")
+    return text
 
 
 def handle_server_response(

From 98139052d797e4cf855075f41c234b0ec85b49fd Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Fri, 7 Nov 2025 13:46:03 +0800
Subject: [PATCH 06/14] fix(tts): return a structured dict with
 'saved_audio_path'

---
 tests/tools/builtin_tools/test_tts.py |  8 ++++++--
 veadk/tools/builtin_tools/tts.py      | 19 +++++++++++++------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
index 61a954e0..b0521e00 100644
--- a/tests/tools/builtin_tools/test_tts.py
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -63,7 +63,9 @@ def test_tts_success(self, mock_session):
         result = text_to_speech("test text", self.mock_tool_context)
 
         # Assertions
-        self.assertEqual("test text", result)  # Still returns True despite error
+        self.assertIsInstance(result, dict)
+        self.assertIn("text", result)
+        self.assertIn("audio_path", result)
         mock_session.return_value.post.assert_called_once()
         mock_response.close.assert_called_once()
 
@@ -79,7 +81,9 @@ def test_tts_failure(self, mock_session):
         result = text_to_speech("test text", self.mock_tool_context)
 
         # Assertions
-        self.assertEqual("test text", result)  # Still returns True despite error
+        self.assertIsInstance(result, dict)
+        self.assertIn("error", result)
+        self.assertIn("Test error", result["error"])
         mock_session.return_value.post.assert_called_once()
 
     @patch("builtins.open")
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index d41c58d3..0f045458 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -21,6 +21,7 @@
 import pyaudio
 import threading
 import tempfile
+from typing import Dict, Any
 from google.adk.tools import ToolContext
 from veadk.config import getenv
 from veadk.utils.logger import get_logger
@@ -45,7 +46,7 @@
 }
 
 
-def text_to_speech(text: str, tool_context: ToolContext) -> str:
+def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
     Use this tool when you need to convert text content into audible speech.
     It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
@@ -56,7 +57,7 @@ def text_to_speech(text: str, tool_context: ToolContext) -> str:
         text: The text to convert.
 
     Returns:
-        The original text.
+        A dict with the saved audio path.
     """
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
     audio_save_path = ""
@@ -67,9 +68,12 @@ def text_to_speech(text: str, tool_context: ToolContext) -> str:
         "TOOL_TTS_SPEAKER", "zh_female_vv_mars_bigtts"
     )  # e.g. zh_female_vv_mars_bigtts
     if not all([app_id, api_key, speaker]):
-        raise ValueError(
-            "Missing required env vars: TOOL_TTS_APP_ID, TOOL_TTS_API_KEY, TOOL_TTS_SPEAKER"
-        )
+        return {
+            "error": (
+                "Tool text_to_speech execution failed. Missing required env vars: "
+                "TOOL_TTS_APP_ID, TOOL_TTS_API_KEY, TOOL_TTS_SPEAKER"
+            )
+        }
 
     headers = {
         "X-Api-App-Id": app_id,
@@ -117,6 +121,9 @@ def text_to_speech(text: str, tool_context: ToolContext) -> str:
 
     except Exception as e:
         logger.error(f"Failed to convert text to speech: {e}")
+        return {
+            "error": (f"Tool text_to_speech execution failed. Execution Error: {e}")
+        }
     finally:
         if audio_save_path and os.path.exists(audio_save_path):
             os.remove(audio_save_path)
@@ -124,7 +131,7 @@ def text_to_speech(text: str, tool_context: ToolContext) -> str:
             response.close()
         session.close()
     logger.debug("Finish convert text to speech")
-    return text
+    return {"saved_audio_path": audio_save_path}
 
 
 def handle_server_response(

From 07a7712ab6baf2baad669f7f751ed5cf36387553 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Fri, 7 Nov 2025 14:22:03 +0800
Subject: [PATCH 07/14] fix(tts): enhance error message and clarify docstring

---
 veadk/tools/builtin_tools/tts.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index 0f045458..ef0b2c04 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -49,9 +49,7 @@
 def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
     Use this tool when you need to convert text content into audible speech.
-    It transforms plain text into natural-sounding speech, and supports customizations including voice timbre
-    selection (e.g., male/female/neutral), speech speed and volume adjustment, as well as exporting the generated
-    audio in common formats (e.g., MP3, WAV).
+    It transforms plain text into natural-sounding speech, as well as exporting the generated audio in pcm format.
 
     Args:
         text: The text to convert.
@@ -122,7 +120,9 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     except Exception as e:
         logger.error(f"Failed to convert text to speech: {e}")
         return {
-            "error": (f"Tool text_to_speech execution failed. Execution Error: {e}")
+            "error": f"Tool text_to_speech execution failed. "
+            f"Response content: {response}"
+            f"Execution Error: {e}"
         }
     finally:
         if audio_save_path and os.path.exists(audio_save_path):

From 0114b3530a41d25a3357cafa613dc0333cb35457 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Fri, 7 Nov 2025 15:07:01 +0800
Subject: [PATCH 08/14] fix(tts): enhance error message and clarify docstring

---
 veadk/tools/builtin_tools/tts.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index ef0b2c04..9155fa04 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -106,10 +106,6 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     try:
         logger.debug(f"Request TTS server with payload: {payload}.")
         response = session.post(url, headers=headers, json=payload, stream=True)
-        log_id = response.headers.get("X-Tt-Logid")
-        logger.debug(
-            f"Response from TTS server with logid: {log_id}, and response body {response}"
-        )
 
         with tempfile.NamedTemporaryFile(
             suffix=".pcm", delete=False, dir=tempfile.gettempdir()
@@ -118,7 +114,9 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
         handle_server_response(response, audio_save_path)
 
     except Exception as e:
-        logger.error(f"Failed to convert text to speech: {e}")
+        logger.error(
+            f"Failed to convert text to speech: {e}Response content: {response}"
+        )
         return {
             "error": f"Tool text_to_speech execution failed. "
             f"Response content: {response}"

From db0f8047a4e50250d1d4f6b9c9cbbc744f311bc6 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Fri, 7 Nov 2025 17:36:51 +0800
Subject: [PATCH 09/14] auth(veauth): support querying the TTS API key via OpenAPI

---
 tests/auth/veauth/test_speech_veauth.py |  0
 veadk/auth/veauth/speech_veauth.py      | 71 +++++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 tests/auth/veauth/test_speech_veauth.py
 create mode 100644 veadk/auth/veauth/speech_veauth.py

diff --git a/tests/auth/veauth/test_speech_veauth.py b/tests/auth/veauth/test_speech_veauth.py
new file mode 100644
index 00000000..e69de29b
diff --git a/veadk/auth/veauth/speech_veauth.py b/veadk/auth/veauth/speech_veauth.py
new file mode 100644
index 00000000..30a96dcc
--- /dev/null
+++ b/veadk/auth/veauth/speech_veauth.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from veadk.auth.veauth.utils import get_credential_from_vefaas_iam
+from veadk.utils.logger import get_logger
+from veadk.utils.volcengine_sign import ve_request
+
+logger = get_logger(__name__)
+
+
+def get_tts_token(region: str = "cn-beijing") -> str:
+    logger.info("Fetching TTS token...")
+
+    access_key = os.getenv("VOLCENGINE_ACCESS_KEY")
+    secret_key = os.getenv("VOLCENGINE_SECRET_KEY")
+    session_token = ""
+
+    if not (access_key and secret_key):
+        # try to get from vefaas iam
+        cred = get_credential_from_vefaas_iam()
+        access_key = cred.access_key_id
+        secret_key = cred.secret_access_key
+        session_token = cred.session_token
+
+    res = ve_request(
+        request_body={"ProjectName": "default", "Filter": {}},
+        header={"X-Security-Token": session_token},
+        action="ListApiKeys",
+        ak=access_key,
+        sk=secret_key,
+        service="ark",
+        version="2024-01-01",
+        region=region,
+        host="open.volcengineapi.com",
+    )
+    try:
+        first_api_key_id = res["Result"]["Items"][0]["Id"]
+    except KeyError:
+        raise ValueError(f"Failed to get ARK api key list: {res}")
+
+    # get raw api key
+    res = ve_request(
+        request_body={"Id": first_api_key_id},
+        header={"X-Security-Token": session_token},
+        action="GetRawApiKey",
+        ak=access_key,
+        sk=secret_key,
+        service="ark",
+        version="2024-01-01",
+        region=region,
+        host="open.volcengineapi.com",
+    )
+    try:
+        api_key = res["Result"]["ApiKey"]
+        logger.info("Successfully fetching ARK API Key.")
+        return api_key
+    except KeyError:
+        raise ValueError(f"Failed to get ARK api key: {res}")

From c5a99cfead15275838f2acd6625816c7032fe96e Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Fri, 7 Nov 2025 17:37:34 +0800
Subject: [PATCH 10/14] auth(veauth): support querying the TTS API key via OpenAPI

---
 tests/auth/veauth/test_speech_veauth.py | 115 ++++++++++++++++++++++++
 veadk/auth/veauth/speech_veauth.py      |  35 ++------
 veadk/tools/builtin_tools/tts.py        |   6 +-
 3 files changed, 127 insertions(+), 29 deletions(-)

diff --git a/tests/auth/veauth/test_speech_veauth.py b/tests/auth/veauth/test_speech_veauth.py
index e69de29b..f4e4908d 100644
--- a/tests/auth/veauth/test_speech_veauth.py
+++ b/tests/auth/veauth/test_speech_veauth.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from unittest.mock import patch, MagicMock
+from veadk.auth.veauth.speech_veauth import get_speech_token
+
+
+# Test cases
+
+
+def test_get_speech_token_with_env_vars(monkeypatch):
+    """Test when credentials are available in environment variables"""
+    # Setup
+    monkeypatch.setenv("VOLCENGINE_ACCESS_KEY", "test_access_key")
+    monkeypatch.setenv("VOLCENGINE_SECRET_KEY", "test_secret_key")
+
+    mock_response = {"Result": {"APIKeys": [{"APIKey": "test_api_key"}]}}
+
+    with patch("veadk.auth.veauth.speech_veauth.ve_request") as mock_ve_request:
+        mock_ve_request.return_value = mock_response
+
+        # Execute
+        result = get_speech_token()
+
+        # Verify
+        assert result == "test_api_key"
+        mock_ve_request.assert_called_once_with(
+            request_body={
+                "ProjectName": "default",
+                "OnlyAvailable": True,
+                "Filter": {},
+            },
+            header={"X-Security-Token": ""},
+            action="ListApiKeys",
+            ak="test_access_key",
+            sk="test_secret_key",
+            service="speech_saas_prod",
+            version="2025-05-20",
+            region="cn-beijing",
+            host="open.volcengineapi.com",
+        )
+
+
+def test_get_speech_token_with_vefaas_iam(monkeypatch):
+    """Test when credentials are obtained from vefaas iam"""
+    # Setup
+    monkeypatch.delenv("VOLCENGINE_ACCESS_KEY", raising=False)
+    monkeypatch.delenv("VOLCENGINE_SECRET_KEY", raising=False)
+
+    mock_cred = MagicMock()
+    mock_cred.access_key_id = "vefaas_access_key"
+    mock_cred.secret_access_key = "vefaas_secret_key"
+    mock_cred.session_token = "vefaas_session_token"
+
+    mock_response = {"Result": {"APIKeys": [{"APIKey": "vefaas_api_key"}]}}
+
+    with (
+        patch(
+            "veadk.auth.veauth.speech_veauth.get_credential_from_vefaas_iam"
+        ) as mock_get_cred,
+        patch("veadk.auth.veauth.speech_veauth.ve_request") as mock_ve_request,
+    ):
+        mock_get_cred.return_value = mock_cred
+        mock_ve_request.return_value = mock_response
+
+        # Execute
+        result = get_speech_token(region="cn-shanghai")
+
+        # Verify
+        assert result == "vefaas_api_key"
+        mock_get_cred.assert_called_once()
+        mock_ve_request.assert_called_once_with(
+            request_body={
+                "ProjectName": "default",
+                "OnlyAvailable": True,
+                "Filter": {},
+            },
+            header={"X-Security-Token": "vefaas_session_token"},
+            action="ListApiKeys",
+            ak="vefaas_access_key",
+            sk="vefaas_secret_key",
+            service="speech_saas_prod",
+            version="2025-05-20",
+            region="cn-shanghai",
+            host="open.volcengineapi.com",
+        )
+
+
+def test_get_speech_token_invalid_response(monkeypatch):
+    """Test that a response without Result/APIKeys raises ValueError"""
+    # Setup: use the pytest `monkeypatch` fixture so both env vars are restored
+    # on teardown; a hand-built pytest.MonkeyPatch() would leak them to later tests
+    monkeypatch.setenv("VOLCENGINE_ACCESS_KEY", "test_access_key")
+    monkeypatch.setenv("VOLCENGINE_SECRET_KEY", "test_secret_key")
+
+    mock_response = {"Error": {"Message": "Invalid request"}}
+
+    with patch("veadk.auth.veauth.speech_veauth.ve_request") as mock_ve_request:
+        mock_ve_request.return_value = mock_response
+
+        # Execute & Verify
+        with pytest.raises(ValueError, match="Failed to get speech api key list"):
+            get_speech_token()
diff --git a/veadk/auth/veauth/speech_veauth.py b/veadk/auth/veauth/speech_veauth.py
index 30a96dcc..5992f601 100644
--- a/veadk/auth/veauth/speech_veauth.py
+++ b/veadk/auth/veauth/speech_veauth.py
@@ -21,8 +21,8 @@
 logger = get_logger(__name__)
 
 
-def get_tts_token(region: str = "cn-beijing") -> str:
-    logger.info("Fetching TTS token...")
+def get_speech_token(region: str = "cn-beijing") -> str:
+    logger.info("Fetching speech token...")
 
     access_key = os.getenv("VOLCENGINE_ACCESS_KEY")
     secret_key = os.getenv("VOLCENGINE_SECRET_KEY")
@@ -36,36 +36,19 @@ def get_tts_token(region: str = "cn-beijing") -> str:
         session_token = cred.session_token
 
     res = ve_request(
-        request_body={"ProjectName": "default", "Filter": {}},
+        request_body={"ProjectName": "default", "OnlyAvailable": True, "Filter": {}},
         header={"X-Security-Token": session_token},
         action="ListApiKeys",
         ak=access_key,
         sk=secret_key,
-        service="ark",
-        version="2024-01-01",
+        service="speech_saas_prod",
+        version="2025-05-20",
         region=region,
         host="open.volcengineapi.com",
     )
     try:
-        first_api_key_id = res["Result"]["Items"][0]["Id"]
+        first_api_key_id = res["Result"]["APIKeys"][0]["APIKey"]
+        logger.info("Successfully fetching speech API Key.")
+        return first_api_key_id
     except KeyError:
-        raise ValueError(f"Failed to get ARK api key list: {res}")
-
-    # get raw api key
-    res = ve_request(
-        request_body={"Id": first_api_key_id},
-        header={"X-Security-Token": session_token},
-        action="GetRawApiKey",
-        ak=access_key,
-        sk=secret_key,
-        service="ark",
-        version="2024-01-01",
-        region=region,
-        host="open.volcengineapi.com",
-    )
-    try:
-        api_key = res["Result"]["ApiKey"]
-        logger.info("Successfully fetching ARK API Key.")
-        return api_key
-    except KeyError:
-        raise ValueError(f"Failed to get ARK api key: {res}")
+        raise ValueError(f"Failed to get speech api key list: {res}")
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index 9155fa04..f75a6493 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -63,7 +63,7 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     app_id = getenv("TOOL_TTS_APP_ID")
     api_key = getenv("TOOL_TTS_API_KEY")
     speaker = getenv(
-        "TOOL_TTS_SPEAKER", "zh_female_vv_mars_bigtts"
+        "TOOL_TTS_SPEAKER", "zh_female_vv_uranus_bigtts"
     )  # e.g. zh_female_vv_mars_bigtts
     if not all([app_id, api_key, speaker]):
         return {
@@ -75,8 +75,8 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
 
     headers = {
         "X-Api-App-Id": app_id,
-        "X-Api-Access-Key": api_key,
-        "X-Api-Resource-Id": "seed-tts-1.0",  # seed-tts-1.0 or seed-tts-2.0
+        "X-Api-Key": api_key,
+        "X-Api-Resource-Id": "seed-tts-2.0",  # seed-tts-1.0 or seed-tts-2.0
         "Content-Type": "application/json",
         "Connection": "keep-alive",
     }

From 368fbeb6b49f7f9cf8e5449241785d50a8071db7 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Sat, 8 Nov 2025 19:48:07 +0800
Subject: [PATCH 11/14] auth(veauth): implement speech token retrieval with
 credential fallback

---
 config.yaml.full                      |  5 +++++
 tests/tools/builtin_tools/test_tts.py |  9 ++++-----
 veadk/auth/veauth/speech_veauth.py    |  4 ++--
 veadk/configs/tool_configs.py         | 12 ++++++++++++
 veadk/tools/builtin_tools/tts.py      | 10 +++++-----
 5 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/config.yaml.full b/config.yaml.full
index 83d232ca..79ac65ca 100644
--- a/config.yaml.full
+++ b/config.yaml.full
@@ -46,6 +46,11 @@ tool:
   web_scraper: 
     endpoint: 
     api_key:    # `token`
+  # [optional] https://console.volcengine.com/speech/new/experience/tts
+  lark:
+    app_id:     # `app_id`
+    api_key:    # `app_secret`
+    speaker:    # `speaker`
   # [optional] https://open.larkoffice.com/app
   lark: 
     endpoint:   # `app_id`
diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
index b0521e00..ddd0b04a 100644
--- a/tests/tools/builtin_tools/test_tts.py
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -37,9 +37,9 @@ def setUp(self):
         self.patcher_env = patch.dict(
             "os.environ",
             {
-                "TOOL_TTS_APP_ID": "test_app_id",
-                "TOOL_TTS_API_KEY": "test_api_key",
-                "TOOL_TTS_SPEAKER": "test_speaker",
+                "TOOL_VESPEECH_APP_ID": "test_app_id",
+                "TOOL_VESPEECH_API_KEY": "test_api_key",
+                "TOOL_VESPEECH_SPEAKER": "test_speaker",
             },
         )
         self.patcher_env.start()
@@ -64,8 +64,7 @@ def test_tts_success(self, mock_session):
 
         # Assertions
         self.assertIsInstance(result, dict)
-        self.assertIn("text", result)
-        self.assertIn("audio_path", result)
+        self.assertIn("saved_audio_path", result)
         mock_session.return_value.post.assert_called_once()
         mock_response.close.assert_called_once()
 
diff --git a/veadk/auth/veauth/speech_veauth.py b/veadk/auth/veauth/speech_veauth.py
index 5992f601..bef342ac 100644
--- a/veadk/auth/veauth/speech_veauth.py
+++ b/veadk/auth/veauth/speech_veauth.py
@@ -36,9 +36,9 @@ def get_speech_token(region: str = "cn-beijing") -> str:
         session_token = cred.session_token
 
     res = ve_request(
-        request_body={"ProjectName": "default", "OnlyAvailable": True, "Filter": {}},
+        request_body={"ProjectName": "default", "OnlyAvailable": True},
         header={"X-Security-Token": session_token},
-        action="ListApiKeys",
+        action="ListAPIKeys",
         ak=access_key,
         sk=secret_key,
         service="speech_saas_prod",
diff --git a/veadk/configs/tool_configs.py b/veadk/configs/tool_configs.py
index 8cca6971..2abc6def 100644
--- a/veadk/configs/tool_configs.py
+++ b/veadk/configs/tool_configs.py
@@ -20,6 +20,7 @@
 
 from veadk.auth.veauth.prompt_pilot_veauth import PromptPilotVeAuth
 from veadk.auth.veauth.vesearch_veauth import VesearchVeAuth
+from veadk.auth.veauth.speech_veauth import get_speech_token
 
 
 class PromptPilotConfig(BaseModel):
@@ -38,5 +39,16 @@ def api_key(self) -> str:
         return os.getenv("TOOL_VESEARCH_API_KEY") or VesearchVeAuth().token
 
 
+class VeSpeechConfig(BaseSettings):
+    model_config = SettingsConfigDict(env_prefix="TOOL_VESPEECH_")
+
+    endpoint: int | str = ""
+
+    @cached_property
+    def api_key(self) -> str:
+        return os.getenv("TOOL_VESPEECH_API_KEY") or get_speech_token()
+
+
 class BuiltinToolConfigs(BaseModel):
     vesearch: VeSearchConfig = Field(default_factory=VeSearchConfig)
+    vespeech: VeSpeechConfig = Field(default_factory=VeSpeechConfig)
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index f75a6493..9918f775 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -23,7 +23,7 @@
 import tempfile
 from typing import Dict, Any
 from google.adk.tools import ToolContext
-from veadk.config import getenv
+from veadk.config import getenv, settings
 from veadk.utils.logger import get_logger
 from veadk.utils.audio_manager import AudioDeviceManager, AudioConfig
 
@@ -60,16 +60,16 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
     audio_save_path = ""
 
-    app_id = getenv("TOOL_TTS_APP_ID")
-    api_key = getenv("TOOL_TTS_API_KEY")
+    app_id = getenv("TOOL_VESPEECH_APP_ID")
     speaker = getenv(
-        "TOOL_TTS_SPEAKER", "zh_female_vv_uranus_bigtts"
+        "TOOL_VESPEECH_SPEAKER", "zh_female_vv_uranus_bigtts"
     )  # e.g. zh_female_vv_mars_bigtts
+    api_key = settings.tool.vespeech.api_key
     if not all([app_id, api_key, speaker]):
         return {
             "error": (
                 "Tool text_to_speech execution failed. Missing required env vars: "
-                "TOOL_TTS_APP_ID, TOOL_TTS_API_KEY, TOOL_TTS_SPEAKER"
+                "TOOL_VESPEECH_APP_ID, TOOL_VESPEECH_API_KEY, TOOL_VESPEECH_SPEAKER"
             )
         }
 

From 42d980dfbec53f8259e317b29853a931ef924972 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Mon, 10 Nov 2025 09:41:36 +0800
Subject: [PATCH 12/14] update config.yaml.full

---
 config.yaml.full | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.yaml.full b/config.yaml.full
index 79ac65ca..8756af6d 100644
--- a/config.yaml.full
+++ b/config.yaml.full
@@ -47,7 +47,7 @@ tool:
     endpoint: 
     api_key:    # `token`
   # [optional] https://console.volcengine.com/speech/new/experience/tts
-  lark:
+  text_to_speech:
     app_id:     # `app_id`
     api_key:    # `app_secret`
     speaker:    # `speaker`

From b0a781afcc821e9da4edd30a35ef3733feef23bb Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Mon, 10 Nov 2025 11:48:11 +0800
Subject: [PATCH 13/14] fix: move pyaudio from py core dependency to extension

---
 pyproject.toml                          |  4 +++-
 tests/auth/veauth/test_speech_veauth.py |  6 ++----
 veadk/tools/builtin_tools/tts.py        |  5 ++---
 veadk/utils/audio_manager.py            | 22 ++++++++++++++++++----
 4 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e97f8dd6..ebb032d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,6 @@ dependencies = [
     "pymysql>=1.1.1", # For MySQL database (short term memory)
     "opensearch-py==2.8.0",
     "filetype>=1.2.0",
-    "pyaudio>=0.2.14",
 ]
 
 [project.scripts]
@@ -55,6 +54,9 @@ database = [
     "tos>=2.8.4",                   # For TOS storage and Viking DB
     "mem0ai==0.1.118",              # For mem0
 ]
+tts = [
+    "pyaudio>=0.2.14",
+]
 eval = [
     "prometheus-client>=0.22.1",    # For exporting data to Prometheus pushgateway
     "deepeval>=3.2.6",              # For DeepEval-based evaluation
diff --git a/tests/auth/veauth/test_speech_veauth.py b/tests/auth/veauth/test_speech_veauth.py
index f4e4908d..74392b16 100644
--- a/tests/auth/veauth/test_speech_veauth.py
+++ b/tests/auth/veauth/test_speech_veauth.py
@@ -40,10 +40,9 @@ def test_get_speech_token_with_env_vars(monkeypatch):
             request_body={
                 "ProjectName": "default",
                 "OnlyAvailable": True,
-                "Filter": {},
             },
             header={"X-Security-Token": ""},
-            action="ListApiKeys",
+            action="ListAPIKeys",
             ak="test_access_key",
             sk="test_secret_key",
             service="speech_saas_prod",
@@ -85,10 +84,9 @@ def test_get_speech_token_with_vefaas_iam(monkeypatch):
             request_body={
                 "ProjectName": "default",
                 "OnlyAvailable": True,
-                "Filter": {},
             },
             header={"X-Security-Token": "vefaas_session_token"},
-            action="ListApiKeys",
+            action="ListAPIKeys",
             ak="vefaas_access_key",
             sk="vefaas_secret_key",
             service="speech_saas_prod",
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index 9918f775..ea9a5f14 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -18,7 +18,6 @@
 import base64
 import time
 import queue
-import pyaudio
 import threading
 import tempfile
 from typing import Dict, Any
@@ -34,7 +33,7 @@
     "format": "pcm",
     "channels": 1,
     "sample_rate": 16000,
-    "bit_size": pyaudio.paInt16,
+    "bit_size": 8,
 }
 
 output_audio_config = {
@@ -42,7 +41,7 @@
     "format": "pcm",
     "channels": 1,
     "sample_rate": 24000,
-    "bit_size": pyaudio.paInt16,
+    "bit_size": 8,
 }
 
 
diff --git a/veadk/utils/audio_manager.py b/veadk/utils/audio_manager.py
index 6f596b97..3cdf94c4 100644
--- a/veadk/utils/audio_manager.py
+++ b/veadk/utils/audio_manager.py
@@ -1,5 +1,19 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from typing import Optional
 
 import pyaudio
 
@@ -7,6 +21,7 @@
 @dataclass
 class AudioConfig:
     """audio config"""
+
     format: str
     bit_size: int
     channels: int
@@ -31,7 +46,7 @@ def open_input_stream(self) -> pyaudio.Stream:
             channels=self.input_config.channels,
             rate=self.input_config.sample_rate,
             input=True,
-            frames_per_buffer=self.input_config.chunk
+            frames_per_buffer=self.input_config.chunk,
         )
         return self.input_stream
 
@@ -41,7 +56,7 @@ def open_output_stream(self) -> pyaudio.Stream:
             channels=self.output_config.channels,
             rate=self.output_config.sample_rate,
             output=True,
-            frames_per_buffer=self.output_config.chunk
+            frames_per_buffer=self.output_config.chunk,
         )
         return self.output_stream
 
@@ -51,4 +66,3 @@ def cleanup(self) -> None:
                 stream.stop_stream()
                 stream.close()
         self.pyaudio.terminate()
-

From d8c68b4ae29f6acae1e2d60ae7d086aa34d4ead5 Mon Sep 17 00:00:00 2001
From: "wangyue.demon" <wangyue.demon@bytedance.com>
Date: Mon, 10 Nov 2025 15:08:52 +0800
Subject: [PATCH 14/14] fix: move pyaudio from py core dependency to extension

---
 pyproject.toml                        |  4 +-
 tests/tools/builtin_tools/test_tts.py |  8 +---
 veadk/tools/builtin_tools/tts.py      | 69 +++++++++++++--------------
 veadk/utils/audio_manager.py          | 29 ++++++++++-
 4 files changed, 63 insertions(+), 47 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ebb032d1..388f99b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,9 +54,7 @@ database = [
     "tos>=2.8.4",                   # For TOS storage and Viking DB
     "mem0ai==0.1.118",              # For mem0
 ]
-tts = [
-    "pyaudio>=0.2.14",
-]
+speech = []
 eval = [
     "prometheus-client>=0.22.1",    # For exporting data to Prometheus pushgateway
     "deepeval>=3.2.6",              # For DeepEval-based evaluation
diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
index ddd0b04a..73f34455 100644
--- a/tests/tools/builtin_tools/test_tts.py
+++ b/tests/tools/builtin_tools/test_tts.py
@@ -86,8 +86,7 @@ def test_tts_failure(self, mock_session):
         mock_session.return_value.post.assert_called_once()
 
     @patch("builtins.open")
-    @patch("pyaudio.PyAudio")
-    def test_handle_server_response_success(self, mock_pyaudio, mock_open):
+    def test_handle_server_response_success(self, mock_open):
         """Test successful response handling"""
         # Setup mock response
         mock_response = MagicMock()
@@ -96,15 +95,10 @@ def test_handle_server_response_success(self, mock_pyaudio, mock_open):
             json.dumps({"code": 20000000}),
         ]
 
-        # Setup mock audio stream
-        mock_stream = MagicMock()
-        mock_pyaudio.return_value.open.return_value = mock_stream
-
         # Call function
         handle_server_response(mock_response, "test.pcm")
 
         # Assertions
-        mock_stream.write.assert_called_with(b"audio_chunk")
         mock_open.assert_called_once_with("test.pcm", "wb")
 
     @patch("builtins.open")
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
index ea9a5f14..185fe826 100644
--- a/veadk/tools/builtin_tools/tts.py
+++ b/veadk/tools/builtin_tools/tts.py
@@ -24,26 +24,9 @@
 from google.adk.tools import ToolContext
 from veadk.config import getenv, settings
 from veadk.utils.logger import get_logger
-from veadk.utils.audio_manager import AudioDeviceManager, AudioConfig
 
 logger = get_logger(__name__)
 
-input_audio_config = {
-    "chunk": 3200,
-    "format": "pcm",
-    "channels": 1,
-    "sample_rate": 16000,
-    "bit_size": 8,
-}
-
-output_audio_config = {
-    "chunk": 3200,
-    "format": "pcm",
-    "channels": 1,
-    "sample_rate": 24000,
-    "bit_size": 8,
-}
-
 
 def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
@@ -57,7 +40,7 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
         A dict with the saved audio path.
     """
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
-    audio_save_path = ""
+    temp_dir = getenv("TOOL_VESPEECH_AUDIO_OUTPUT_PATH", tempfile.gettempdir())
 
     app_id = getenv("TOOL_VESPEECH_APP_ID")
     speaker = getenv(
@@ -106,10 +89,13 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
         logger.debug(f"Request TTS server with payload: {payload}.")
         response = session.post(url, headers=headers, json=payload, stream=True)
 
+        os.makedirs(temp_dir, exist_ok=True)
         with tempfile.NamedTemporaryFile(
-            suffix=".pcm", delete=False, dir=tempfile.gettempdir()
+            suffix=".pcm", delete=False, dir=temp_dir
         ) as tmp:
             audio_save_path = tmp.name  # e.g. /tmp/tts_12345.pcm
+            logger.debug(f"Created temporary file: {audio_save_path}")
+
         handle_server_response(response, audio_save_path)
 
     except Exception as e:
@@ -122,8 +108,6 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
             f"Execution Error: {e}"
         }
     finally:
-        if audio_save_path and os.path.exists(audio_save_path):
-            os.remove(audio_save_path)
         if response:
             response.close()
         session.close()
@@ -150,18 +134,29 @@ def handle_server_response(
     audio_queue = queue.Queue()
     total_audio_size = 0
 
-    audio_device = AudioDeviceManager(
-        AudioConfig(**input_audio_config), AudioConfig(**output_audio_config)
-    )
-
-    # init output stream
-    output_stream = audio_device.open_output_stream()
+    output_stream, player_thread = None, None
     stop_event = threading.Event()
-    player_thread = threading.Thread(
-        target=_audio_player_thread, args=(audio_queue, output_stream, stop_event)
-    )
-    player_thread.daemon = True
-    player_thread.start()
+    try:
+        from veadk.utils.audio_manager import (
+            AudioDeviceManager,
+            AudioConfig,
+            input_audio_config,
+            output_audio_config,
+        )
+
+        audio_device = AudioDeviceManager(
+            AudioConfig(**input_audio_config), AudioConfig(**output_audio_config)
+        )
+
+        # init output stream
+        output_stream = audio_device.open_output_stream()
+        player_thread = threading.Thread(
+            target=_audio_player_thread, args=(audio_queue, output_stream, stop_event)
+        )
+        player_thread.daemon = True
+        player_thread.start()
+    except Exception as e:
+        logger.error(f"Failed to initialize audio device: {e}")
 
     try:
         for chunk in response.iter_lines(decode_unicode=True):
@@ -194,10 +189,12 @@ def handle_server_response(
         logger.error(f"handle tts failed: {e}, response: {response}")
         raise
     finally:
-        audio_queue.join()
-        stop_event.set()
-        player_thread.join()
-        output_stream.close()
+        if output_stream:
+            audio_queue.join()
+            stop_event.set()
+            if player_thread and player_thread.is_alive():
+                player_thread.join()
+            output_stream.close()
 
 
 def _audio_player_thread(audio_queue, output_stream, stop_event):
diff --git a/veadk/utils/audio_manager.py b/veadk/utils/audio_manager.py
index 3cdf94c4..638ff889 100644
--- a/veadk/utils/audio_manager.py
+++ b/veadk/utils/audio_manager.py
@@ -15,7 +15,29 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import pyaudio
+try:
+    import pyaudio
+
+    PYAUDIO_AVAILABLE = True
+except ImportError:
+    pyaudio = None
+    PYAUDIO_AVAILABLE = False
+
+input_audio_config = {  # settings for the input stream
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 16000,
+    "bit_size": pyaudio.paInt16 if PYAUDIO_AVAILABLE else 8,  # 8 == paInt16; keeps import safe without pyaudio
+}
+
+output_audio_config = {  # settings for the output stream
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 24000,
+    "bit_size": pyaudio.paInt16 if PYAUDIO_AVAILABLE else 8,  # 8 == paInt16; keeps import safe without pyaudio
+}
 
 
 @dataclass
@@ -33,6 +55,11 @@ class AudioDeviceManager:
     """audio device manager, handle audio input/output"""
 
     def __init__(self, input_config: AudioConfig, output_config: AudioConfig):
+        if not PYAUDIO_AVAILABLE:
+            raise RuntimeError(
+                "pyaudio is not installed. Please install it via: "
+                "pip install veadk-python[speech]"
+            )
         self.input_config = input_config
         self.output_config = output_config
         self.pyaudio = pyaudio.PyAudio()