Merged
49 commits
ad4903e  Speech pipeline POC (michalkulakowski, Sep 3, 2025)
6eb0d4b  fix (michalkulakowski, Sep 3, 2025)
4a1d621  update (michalkulakowski, Sep 3, 2025)
ec744ef  fix (michalkulakowski, Sep 4, 2025)
b188a34  fix (michalkulakowski, Sep 4, 2025)
0727122  fix (michalkulakowski, Sep 11, 2025)
07082c0  fix (michalkulakowski, Sep 17, 2025)
8337880  fix build (dtrawins, Oct 3, 2025)
8ec525a  spelling (dtrawins, Oct 3, 2025)
d8246e2  style (dtrawins, Oct 3, 2025)
3a6b615  style (dtrawins, Oct 3, 2025)
1d76121  demo and export script (dtrawins, Oct 3, 2025)
4844935  style (dtrawins, Oct 3, 2025)
4e09039  fix (dtrawins, Oct 3, 2025)
aefc737  fix (dtrawins, Oct 5, 2025)
94ffc06  demo init (dtrawins, Oct 6, 2025)
b0810dd  fix unit tests (dtrawins, Oct 6, 2025)
b400021  fix (dtrawins, Oct 6, 2025)
0f409c1  windows fix (dtrawins, Oct 6, 2025)
1bf5090  gpu fix (dtrawins, Oct 6, 2025)
0cc4938  stype (dtrawins, Oct 6, 2025)
5e669df  stype (dtrawins, Oct 6, 2025)
ef3cdba  Update README.md (dtrawins, Oct 6, 2025)
4cea415  cleanup (michalkulakowski, Oct 22, 2025)
359b9c9  style (michalkulakowski, Oct 22, 2025)
3e65037  fix (michalkulakowski, Oct 22, 2025)
288466c  Add calculator (michalkulakowski, Oct 22, 2025)
dc21e31  fix (michalkulakowski, Oct 22, 2025)
e4b5e9c  style (michalkulakowski, Oct 22, 2025)
4ed9ce4  fix (michalkulakowski, Oct 22, 2025)
ec57fee  fix (michalkulakowski, Oct 22, 2025)
9b9af0b  fix (michalkulakowski, Oct 22, 2025)
4bf65f4  fix (michalkulakowski, Oct 22, 2025)
cc37fd2  review (michalkulakowski, Oct 24, 2025)
0b50bf5  review (michalkulakowski, Oct 24, 2025)
4519280  style (michalkulakowski, Oct 24, 2025)
28ed221  review (michalkulakowski, Oct 24, 2025)
a2f9e00  style (michalkulakowski, Oct 24, 2025)
6bbbbbf  style (michalkulakowski, Oct 24, 2025)
60809ac  fix (michalkulakowski, Oct 24, 2025)
1e093c3  style (michalkulakowski, Oct 24, 2025)
d55f4b3  fix (michalkulakowski, Oct 24, 2025)
9c0e649  Update audio_utils.cpp (michalkulakowski, Oct 24, 2025)
74a9817  Update audio_utils.cpp (michalkulakowski, Oct 24, 2025)
c7ec173  fix (michalkulakowski, Oct 28, 2025)
fa28c80  fix (michalkulakowski, Oct 28, 2025)
0919ee2  style (michalkulakowski, Oct 28, 2025)
184da98  fix (michalkulakowski, Oct 28, 2025)
054fef3  fix (michalkulakowski, Oct 28, 2025)

16 changes: 16 additions & 0 deletions WORKSPACE
@@ -636,3 +636,19 @@ cc_library(
)
""",
)

new_git_repository(
    name = "dr_libs",
    remote = "https://github.com/mackron/dr_libs",
    commit = "24d738be2349fd4b6fe50eeaa81f5bd586267fd0",
    build_file_content = """
cc_library(
    name = "dr",
    hdrs = ["dr_flac.h", "dr_mp3.h", "dr_wav.h"],
    visibility = ["//visibility:public"],
    local_defines = [
    ],
)
""",
)

25 changes: 25 additions & 0 deletions demos/audio/README.md
@@ -0,0 +1,25 @@
# Audio endpoints

## Audio synthesis

python export_model.py text2speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan --weight-format fp16

docker run -p 8000:8000 -d -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000

curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.wav

## Audio transcription

python export_model.py speech2text --source_model openai/whisper-large-v2 --weight-format fp16 --target_device GPU

docker run -p 8000:8000 -it --device /dev/dri -u 0 -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000

curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper"

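For reference, a minimal Python sketch of the same two REST calls using the requests library; this is illustrative only and assumes the server is reachable on the port published by the docker commands above and that the transcription endpoint returns an OpenAI-style JSON body:

```python
import requests

base = "http://localhost:8000/v3"  # port published by the docker run commands above

# Text to speech: mirrors the curl /audio/speech example and saves the WAV response.
speech = requests.post(
    f"{base}/audio/speech",
    json={"model": "speecht5_tts", "input": "The quick brown fox jumped over the lazy dog."},
)
speech.raise_for_status()
with open("audio.wav", "wb") as f:
    f.write(speech.content)

# Speech to text: mirrors the curl /audio/transcriptions multipart example.
with open("audio.wav", "rb") as f:
    transcription = requests.post(
        f"{base}/audio/transcriptions",
        files={"file": ("audio.wav", f, "audio/wav")},
        data={"model": "whisper"},
    )
transcription.raise_for_status()
print(transcription.json())
```
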
33 changes: 33 additions & 0 deletions demos/audio/openai_speech2text.py
@@ -0,0 +1,33 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pathlib import Path
from openai import OpenAI

filename = "speech.wav"
url = "http://localhost:8125/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

# Send the local WAV file to the /v3/audio/transcriptions endpoint.
with open(speech_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="openai/whisper-large-v2",
        file=audio_file
    )

print(transcript)
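A possible batch variant of the demo above, sketched under the assumption that the server returns a standard OpenAI-style Transcription object exposing the recognized text via .text (port and model name follow the script above):

```python
from pathlib import Path
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8125/v3", api_key="not_used")

# Transcribe every WAV file placed next to this script and print the recognized text.
for wav_path in Path(__file__).parent.glob("*.wav"):
    with wav_path.open("rb") as audio_file:
        result = client.audio.transcriptions.create(
            model="openai/whisper-large-v2",
            file=audio_file,
        )
    print(f"{wav_path.name}: {result.text}")
```
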
36 changes: 36 additions & 0 deletions demos/audio/openai_text2speech.py
@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pathlib import Path
from openai import OpenAI

prompt = "Intel Corporation is an American multinational technology company headquartered in Santa Clara, California. Intel designs, manufactures, and sells computer components such as central processing units (CPUs) and related products for business and consumer markets. It was the world's third-largest semiconductor chip manufacturer by revenue in 2024 and has been included in the Fortune 500 list of the largest United States corporations by revenue since 2007. It was one of the first companies listed on Nasdaq. Since 2025, it is partially owned by the United States government."
filename = "speech.wav"
url = "http://localhost:8125/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

# Stream the synthesized audio from the /v3/audio/speech endpoint straight to a WAV file.
with client.audio.speech.with_streaming_response.create(
    model="microsoft/speecht5_tts",
    voice="unused",
    input=prompt
) as response:
    response.stream_to_file(speech_file_path)

print("Generation finished")
83 changes: 81 additions & 2 deletions demos/common/export_models/export_model.py
@@ -85,8 +85,55 @@ def add_common_arguments(parser):
parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt')
parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps')
parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps')

parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint')
add_common_arguments(parser_text2speech)
parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example, microsoft/speecht5_hifigan', dest='vocoder')

parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
add_common_arguments(parser_speech2text)
parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
args = vars(parser.parse_args())

tts_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "TtsExecutor"
  input_side_packet: "TTS_NODE_RESOURCES:tts_servable"
  calculator: "TtsCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.TtsCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

stt_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "SttExecutor"
  input_side_packet: "STT_NODE_RESOURCES:stt_servable"
  calculator: "SttCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.SttCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

embedding_graph_ov_template = """
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
@@ -457,7 +504,34 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting text2speech model to ", destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export text2speech model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(tts_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_speech2text_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting speech2text model to ", destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export speech2text model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(stt_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length):
    destination_path = os.path.join(model_repository_path, model_name)
@@ -585,14 +659,19 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
    export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'embeddings_ov':
    export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate'])
    export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters)

elif args['task'] == 'rerank':
    export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'rerank_ov':
    export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'text2speech':
    export_text2speech_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'speech2text':
    export_speech2text_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])
elif args['task'] == 'image_generation':
    template_parameters = {k: v for k, v in args.items() if k in [
        'ov_cache_dir',
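To illustrate how the graph templates added above resolve their defaults when rendered into graph.pbtxt, here is a small jinja2 sketch; the snippet string is only a stand-in for the plugin_config and device lines of tts_graph_template, not the full template:

```python
import jinja2

# Stand-in for the plugin_config and device lines of tts_graph_template.
snippet = (
    "plugin_config: '{ \"NUM_STREAMS\": \"{{num_streams|default(1, true)}}\" }',\n"
    'device: "{{target_device|default("CPU", true)}}"'
)

env = jinja2.Environment(loader=jinja2.BaseLoader)
# num_streams=0 is falsy, so default(1, true) falls back to 1; with target_device unset, CPU is used.
print(env.from_string(snippet).render(num_streams=0))
# Output:
# plugin_config: '{ "NUM_STREAMS": "1" }',
# device: "CPU"
```
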
3 changes: 3 additions & 0 deletions src/BUILD
@@ -558,6 +558,9 @@ ovms_cc_library(
"//conditions:default": [],
"//:not_disable_mediapipe" : [
"//src/image_gen:image_gen_calculator",
"//src/audio/speech_to_text:stt_calculator",
"//src/audio/text_to_speech:tts_calculator",
"//src/audio:audio_utils",
"//src/image_gen:imagegen_init",
"//src/llm:openai_completions_api_handler",
"//src/embeddings:embeddingscalculator_ov",
31 changes: 31 additions & 0 deletions src/audio/BUILD
@@ -0,0 +1,31 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
load("//:common_settings.bzl", "ovms_cc_library")

ovms_cc_library(
    name = "audio_utils",
    hdrs = ["audio_utils.hpp"],
    srcs = ["audio_utils.cpp"],
    visibility = ["//visibility:public"],
    deps = [
        "//src:libovmslogging",
        "//src/port:dr_audio",
        "//src:libovmstimer",
    ],
    alwayslink = 1,
)