1 change: 1 addition & 0 deletions mcp/server/mcp_server_speech/.python-version
@@ -0,0 +1 @@
3.12
91 changes: 91 additions & 0 deletions mcp/server/mcp_server_speech/README.md
@@ -0,0 +1,91 @@
# Speech Model Context Protocol Server

An MCP server implementation for Volcengine speech services (ASR and TTS).

## Features

### Tools

- **asr**
  Automatic Speech Recognition: converts audio to text.
  - Args:
    - content: URL or absolute path of the audio file to transcribe.
  - Returns:
    - The recognized text.
- **tts**
  Text-to-Speech: synthesizes text into audio.
  - Args:
    - text: the text to synthesize into speech.
    - speed: speech speed (e.g., 1.0 for normal). Default: 1.0.
    - encoding: desired audio output format ('mp3' or 'wav'). Default: 'mp3'.
  - Returns:
    - The path of the generated audio file.

## Configuration

The server requires the following environment variables to be set:

- `VOLC_APPID`: Required. The app ID for the Volcengine speech services.
- `VOLC_TOKEN`: Required. The access token for the Volcengine speech services.
- `VOLC_VOICE_TYPE`: Optional. The voice_type for the speech synthesis large-model service. Default: 'zh_female_meilinvyou_moon_bigtts'.
- `VOLC_CLUSTER`: Required. The cluster ID of the speech synthesis large-model service.

The following services need to be activated on Volcengine: [Large speech synthesis model](https://console.volcengine.com/speech/service/10007), [Streaming speech recognition large model](https://console.volcengine.com/speech/service/10011), and [Large model for audio file recognition](https://console.volcengine.com/speech/service/10012).

You can set these environment variables in your shell.
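For example, in a POSIX shell (the values below are placeholders; substitute your own credentials):

```bash
# Placeholder values; replace with your own Volcengine credentials.
export VOLC_APPID="your appid"
export VOLC_TOKEN="your token"
export VOLC_CLUSTER="your tts cluster id"
# Optional: override the default voice type.
export VOLC_VOICE_TYPE="zh_female_meilinvyou_moon_bigtts"
```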

### MCP Settings Configuration

To add this server to your MCP configuration, add the following to your MCP settings file:

```json
{
  "mcpServers": {
    "speech-mcp-server": {
      "command": "uv",
      "args": [
        "--directory",
        "/ABSOLUTE/PATH/TO/PARENT/FOLDER/src/mcp_server_speech",
        "run",
        "main.py"
      ]
    }
  }
}
```

or

```json
{
  "mcpServers": {
    "speech-mcp-server": {
      "command": "uvx",
      "args": [
        "--from",
        "git+https://github.com/volcengine/ai-app-lab#subdirectory=mcp/server/mcp_server_speech",
        "mcp-server-speech"
      ],
      "env": {
        "VOLC_APPID": "your appid",
        "VOLC_TOKEN": "your token",
        "VOLC_VOICE_TYPE": "tts voice type",
        "VOLC_CLUSTER": "tts cluster id"
      }
    }
  }
}
```

## Usage

### Running the Server

```bash
# Run the server with stdio transport (default)
python -m mcp_server_speech [--transport/-t {sse,stdio}]
```
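
### Calling the Tools

As a rough sketch of how the `asr` and `tts` tools can be exercised end to end, the snippet below uses the stdio client from the MCP Python SDK (`ClientSession` and `stdio_client` from the `mcp` package, which this project already depends on). The server command, audio URL, and credential values are placeholders, not part of this repository:

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def demo():
    # Placeholder command and credentials; adjust to your environment.
    server = StdioServerParameters(
        command="python",
        args=["-m", "mcp_server_speech"],
        env={
            "VOLC_APPID": "your appid",
            "VOLC_TOKEN": "your token",
            "VOLC_CLUSTER": "your tts cluster id",
        },
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Synthesize speech; the result contains the path of the generated file.
            tts_result = await session.call_tool(
                "tts", arguments={"text": "你好，世界", "speed": 1.0, "encoding": "mp3"}
            )
            print(tts_result)
            # Transcribe an audio file referenced by URL (placeholder URL).
            asr_result = await session.call_tool(
                "asr", arguments={"content": "https://example.com/sample.mp3"}
            )
            print(asr_result)


if __name__ == "__main__":
    asyncio.run(demo())
```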

## License

This library is licensed under the MIT-0 License. See the LICENSE file.
45 changes: 45 additions & 0 deletions mcp/server/mcp_server_speech/pyproject.toml
@@ -0,0 +1,45 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "mcp-server-speech"
version = "0.1.0"
description = "An MCP server providing speech-related tools like ASR and TTS."
readme = "README.md"
requires-python = ">=3.12"
license = { text = "MIT" }
authors = [{ name = "lixiangrong", email = "[email protected]" }]
dependencies = [
    "aiofiles>=24.1.0",
    "mcp[cli]>=1.6.0",
    "pydub>=0.25.1",
    "python-dotenv>=1.1.0",
    "requests>=2.32.3",
    "websockets<14",
]

[project.scripts]
mcp-server-speech = "mcp_server_speech.main:main"

[tool.hatch.metadata]
allow-direct-references = true

[tool.ruff]
# Add ruff linting/formatting configurations here
line-length = 88
lint.select = [
    "E",
    "F",
    "W",
    "I",
    "N",
    "UP",
    "B",
    "A",
    "C4",
    "T20",
    "SIM",
    "PTH",
]
lint.ignore = ["E501"] # Ignore line length errors if needed
Empty file.
@@ -0,0 +1,4 @@
from mcp_server_speech.main import main # Updated import path

if __name__ == "__main__":
    main()
61 changes: 61 additions & 0 deletions mcp/server/mcp_server_speech/src/mcp_server_speech/config.py
@@ -0,0 +1,61 @@
import logging
import os
from pathlib import Path

log_dir = Path(__file__).resolve().parent.parent.parent / "logs"
log_dir.mkdir(exist_ok=True)

log_formatter = logging.Formatter(
    "%(asctime)s - %(name)s - [%(filename)s:%(lineno)d]- %(levelname)s - %(message)s"
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(log_formatter)

# Create file handler
log_file = os.getenv("LOG_FILE_PATH", str(log_dir / "speech.log"))
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setFormatter(log_formatter)

# Configure root logger
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(console_handler)

# Enable file logging based on environment variable
if os.getenv("ENABLE_FILE_LOGGING", "false").lower() == "true":
root_logger.addHandler(file_handler)

logger = logging.getLogger(__name__)

# load environment variables

VOLC_APPID = None
VOLC_TOKEN = None
VOLC_CLUSTER = None
VOLC_VOICE_TYPE = None


def load_config():
    global VOLC_APPID, VOLC_TOKEN, VOLC_CLUSTER, VOLC_VOICE_TYPE

    VOLC_APPID = os.getenv("VOLC_APPID")
    logger.info(f"VOLC_APPID loaded: {VOLC_APPID}")  # Log loaded value
    VOLC_TOKEN = os.getenv("VOLC_TOKEN")
    logger.info(f"VOLC_TOKEN loaded: {VOLC_TOKEN}")  # Log loaded value
    VOLC_CLUSTER = os.getenv("VOLC_CLUSTER")
    logger.info(f"VOLC_CLUSTER loaded: {VOLC_CLUSTER}")  # Log loaded value
    VOLC_VOICE_TYPE = os.getenv("VOLC_VOICE_TYPE", "zh_female_meilinvyou_moon_bigtts")
    logger.info(f"VOLC_VOICE_TYPE loaded: {VOLC_VOICE_TYPE}")  # Log loaded value

    # Check if required environment variables are set
    if not all([VOLC_APPID, VOLC_TOKEN, VOLC_CLUSTER]):
        logger.error(
            "Missing required environment variables: VOLC_APPID, VOLC_TOKEN, VOLC_CLUSTER"
        )
        raise ValueError(
            "Missing required environment variables: VOLC_APPID, VOLC_TOKEN, VOLC_CLUSTER"
        )


load_config()
33 changes: 33 additions & 0 deletions mcp/server/mcp_server_speech/src/mcp_server_speech/main.py
@@ -0,0 +1,33 @@
import argparse
import logging

from mcp_server_speech.server import mcp

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(description="Run the Speech MCP Server")
    parser.add_argument(
        "--transport",
        "-t",
        choices=["sse", "stdio"],
        default="stdio",
        help="Transport protocol to use (sse or stdio)",
    )

    args = parser.parse_args()
    mcp.transport = args.transport  # Store transport on mcp object

    try:
        logger.info(f"Starting Speech MCP Server with transport: {mcp.transport}")
        mcp.run(transport=mcp.transport)
    except KeyboardInterrupt:
        logger.info("Speech MCP Server stopped by user.")
    except Exception as e:
        logger.error(f"Error starting Speech MCP Server: {str(e)}")
        raise


if __name__ == "__main__":
    main()
51 changes: 51 additions & 0 deletions mcp/server/mcp_server_speech/src/mcp_server_speech/models.py
@@ -0,0 +1,51 @@
from enum import Enum

from pydantic import BaseModel, Field

# --- Pydantic Models for Tool Inputs/Outputs ---


class AudioSourceType(Enum):
    URL = "url"
    FILE = "file"


class AsrInputArgs(BaseModel):
    """Arguments for the ASR tool."""

    source: str = Field(
        ..., description="Path to the audio file or URL of the audio stream."
    )
    source_type: AudioSourceType = Field(
        ..., description="Type of the audio source (e.g., 'file', 'url')."
    )
    options: dict | None = Field(
        None, description="Additional options for ASR processing."
    )


class AsrOutputResult(BaseModel):
    """Output of the ASR tool."""

    text: str = Field(..., description="The recognized text from the audio.")


class TtsInputArgs(BaseModel):
    """Arguments for the TTS tool."""

    text: str = Field(..., description="The text to synthesize into speech.")
    speed: float = Field(1.0, description="Speech speed (e.g., 1.0 for normal).")
    encoding: str = Field(
        "mp3", description="Desired audio output format (e.g., 'mp3', 'wav')."
    )


class TtsOutputResult(BaseModel):
    """Output of the TTS tool."""

    format: str = Field(
        ..., description="The format of the audio data (e.g., 'mp3', 'wav')."
    )
    file_path: str = Field(
        "", description="The path to the saved audio file (if applicable)."
    )
105 changes: 105 additions & 0 deletions mcp/server/mcp_server_speech/src/mcp_server_speech/server.py
@@ -0,0 +1,105 @@
import logging
import os

from mcp.server.fastmcp import FastMCP
from pydantic import Field

from mcp_server_speech.models import (
    AsrInputArgs,
    TtsInputArgs,
    TtsOutputResult,
)
from mcp_server_speech.services.asr import AsrService, AudioSourceType
from mcp_server_speech.services.tts import tts_request_handler

# Initialize FastMCP server
mcp = FastMCP("Speech MCP Server", port=int(os.getenv("PORT", "8000")))

logger = logging.getLogger(__name__)


@mcp.tool()
async def tts(
    text: str = Field(..., description="The text to synthesize into speech."),
    speed: float = Field(
        1.0, description="Speech speed (e.g., 1.0 for normal). default: 1.0."
    ),
    encoding: str = Field(
        "mp3",
        description="Desired audio output format (e.g., 'mp3', 'wav'). default: 'mp3'.",
    ),
) -> TtsOutputResult:
    """
    Text-to-Speech: Synthesizes text into audio.
    Returns the path of the generated audio file.
    """

    # Parameter validation logic
    if not text or text.strip() == "":
        raise ValueError("The text parameter cannot be empty.")
    if speed <= 0:
        raise ValueError("Speed must be a positive value.")
    if encoding not in ("mp3", "wav"):
        raise ValueError("Encoding must be either 'mp3' or 'wav'.")

    try:
        result = await tts_request_handler(
            TtsInputArgs(text=text, speed=speed, encoding=encoding)
        )

        return result
    except ValueError as e:
        logger.error(f"Value error in Text to Speech: {e}")
        raise
    except TimeoutError as e:
        logger.error(f"Timeout error in Text to Speech: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in Text to Speech: {e}")
        raise


@mcp.tool()
async def asr(
    content: str = Field(
        ...,
        description="URL or absolute path of the audio file to transcribe.",
    ),
) -> str:
    """
    Automatic Speech Recognition: Converts audio to text.
    """

    # Parameter validation logic
    if not content or content.strip() == "":
        raise ValueError("The content parameter cannot be empty.")

    try:
        service = AsrService()
        source_type = service.detect_source_type(content)  # Detect source type
        result = None
        options = None

        if mcp.transport == "sse" and source_type == AudioSourceType.FILE:
            return "Error: SSE transport does not support file input."

        if mcp.transport == "stdio":
            options = {"format": "mp3", "rate": 16000, "channel": 1, "bits": 16}

        result = await service.recognize(
            AsrInputArgs(source=content, source_type=source_type, options=options)
        )

        logger.info(f"asr result: {result}")

        return result.text

    except ValueError as e:
        logger.error(f"Value error in Automatic Speech Recognition: {e}")
        raise
    except TimeoutError as e:
        logger.error(f"Timeout error in Automatic Speech Recognition: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in Automatic Speech Recognition: {e}")
        raise
Empty file.