Merged
49 commits
ad4903e  Speech pipeline POC (michalkulakowski, Sep 3, 2025)
6eb0d4b  fix (michalkulakowski, Sep 3, 2025)
4a1d621  update (michalkulakowski, Sep 3, 2025)
ec744ef  fix (michalkulakowski, Sep 4, 2025)
b188a34  fix (michalkulakowski, Sep 4, 2025)
0727122  fix (michalkulakowski, Sep 11, 2025)
07082c0  fix (michalkulakowski, Sep 17, 2025)
8337880  fix build (dtrawins, Oct 3, 2025)
8ec525a  spelling (dtrawins, Oct 3, 2025)
d8246e2  style (dtrawins, Oct 3, 2025)
3a6b615  style (dtrawins, Oct 3, 2025)
1d76121  demo and export script (dtrawins, Oct 3, 2025)
4844935  style (dtrawins, Oct 3, 2025)
4e09039  fix (dtrawins, Oct 3, 2025)
aefc737  fix (dtrawins, Oct 5, 2025)
94ffc06  demo init (dtrawins, Oct 6, 2025)
b0810dd  fix unit tests (dtrawins, Oct 6, 2025)
b400021  fix (dtrawins, Oct 6, 2025)
0f409c1  windows fix (dtrawins, Oct 6, 2025)
1bf5090  gpu fix (dtrawins, Oct 6, 2025)
0cc4938  stype (dtrawins, Oct 6, 2025)
5e669df  stype (dtrawins, Oct 6, 2025)
ef3cdba  Update README.md (dtrawins, Oct 6, 2025)
4cea415  cleanup (michalkulakowski, Oct 22, 2025)
359b9c9  style (michalkulakowski, Oct 22, 2025)
3e65037  fix (michalkulakowski, Oct 22, 2025)
288466c  Add calculator (michalkulakowski, Oct 22, 2025)
dc21e31  fix (michalkulakowski, Oct 22, 2025)
e4b5e9c  style (michalkulakowski, Oct 22, 2025)
4ed9ce4  fix (michalkulakowski, Oct 22, 2025)
ec57fee  fix (michalkulakowski, Oct 22, 2025)
9b9af0b  fix (michalkulakowski, Oct 22, 2025)
4bf65f4  fix (michalkulakowski, Oct 22, 2025)
cc37fd2  review (michalkulakowski, Oct 24, 2025)
0b50bf5  review (michalkulakowski, Oct 24, 2025)
4519280  style (michalkulakowski, Oct 24, 2025)
28ed221  review (michalkulakowski, Oct 24, 2025)
a2f9e00  style (michalkulakowski, Oct 24, 2025)
6bbbbbf  style (michalkulakowski, Oct 24, 2025)
60809ac  fix (michalkulakowski, Oct 24, 2025)
1e093c3  style (michalkulakowski, Oct 24, 2025)
d55f4b3  fix (michalkulakowski, Oct 24, 2025)
9c0e649  Update audio_utils.cpp (michalkulakowski, Oct 24, 2025)
74a9817  Update audio_utils.cpp (michalkulakowski, Oct 24, 2025)
c7ec173  fix (michalkulakowski, Oct 28, 2025)
fa28c80  fix (michalkulakowski, Oct 28, 2025)
0919ee2  style (michalkulakowski, Oct 28, 2025)
184da98  fix (michalkulakowski, Oct 28, 2025)
054fef3  fix (michalkulakowski, Oct 28, 2025)

16 changes: 16 additions & 0 deletions WORKSPACE
@@ -636,3 +636,19 @@ cc_library(
)
""",
)

new_git_repository(
    name = "dr_libs",
    remote = "https://github.com/mackron/dr_libs",
    commit = "24d738be2349fd4b6fe50eeaa81f5bd586267fd0",
    build_file_content = """
cc_library(
    name = "dr",
    hdrs = ["dr_flac.h", "dr_mp3.h", "dr_wav.h"],
    visibility = ["//visibility:public"],
    local_defines = [
    ],
)
""",
)

25 changes: 25 additions & 0 deletions demos/audio/README.md
@@ -0,0 +1,25 @@
# Audio endpoints

## Audio synthesis

python export_model.py text2speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan --weight-format fp16

docker run -p 8000:8000 -d -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000

curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.wav

## Audio transcription

python export_model.py speech2text --source_model openai/whisper-large-v2 --weight-format fp16 --target_device GPU

docker run -p 8000:8000 -it --device /dev/dri -u 0 -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000

curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper"

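For reference, a minimal Python sketch of the same two REST calls using the requests library; this is illustrative only and assumes the server is reachable on the port published by the docker commands above and that the transcription endpoint returns an OpenAI-style JSON body:

```python
import requests

base = "http://localhost:8000/v3"  # port published by the docker run commands above

# Text to speech: mirrors the curl /audio/speech example and saves the WAV response.
speech = requests.post(
    f"{base}/audio/speech",
    json={"model": "speecht5_tts", "input": "The quick brown fox jumped over the lazy dog."},
)
speech.raise_for_status()
with open("audio.wav", "wb") as f:
    f.write(speech.content)

# Speech to text: mirrors the curl /audio/transcriptions multipart example.
with open("audio.wav", "rb") as f:
    transcription = requests.post(
        f"{base}/audio/transcriptions",
        files={"file": ("audio.wav", f, "audio/wav")},
        data={"model": "whisper"},
    )
transcription.raise_for_status()
print(transcription.json())
```
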
33 changes: 33 additions & 0 deletions demos/audio/openai_speech2text.py
@@ -0,0 +1,33 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pathlib import Path
from openai import OpenAI

filename = "speech.wav"
url = "http://localhost:8125/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

# Send the local WAV file to the /v3/audio/transcriptions endpoint.
with open(speech_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="openai/whisper-large-v2",
        file=audio_file
    )

print(transcript)
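A possible batch variant of the demo above, sketched under the assumption that the server returns a standard OpenAI-style Transcription object exposing the recognized text via .text (port and model name follow the script above):

```python
from pathlib import Path
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8125/v3", api_key="not_used")

# Transcribe every WAV file placed next to this script and print the recognized text.
for wav_path in Path(__file__).parent.glob("*.wav"):
    with wav_path.open("rb") as audio_file:
        result = client.audio.transcriptions.create(
            model="openai/whisper-large-v2",
            file=audio_file,
        )
    print(f"{wav_path.name}: {result.text}")
```
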
36 changes: 36 additions & 0 deletions demos/audio/openai_text2speech.py
@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pathlib import Path
from openai import OpenAI

prompt = "Intel Corporation is an American multinational technology company headquartered in Santa Clara, California. Intel designs, manufactures, and sells computer components such as central processing units (CPUs) and related products for business and consumer markets. It was the world's third-largest semiconductor chip manufacturer by revenue in 2024 and has been included in the Fortune 500 list of the largest United States corporations by revenue since 2007. It was one of the first companies listed on Nasdaq. Since 2025, it is partially owned by the United States government."
filename = "speech.wav"
url = "http://localhost:8125/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

# Stream the synthesized audio from the /v3/audio/speech endpoint straight to a WAV file.
with client.audio.speech.with_streaming_response.create(
    model="microsoft/speecht5_tts",
    voice="unused",
    input=prompt
) as response:
    response.stream_to_file(speech_file_path)

print("Generation finished")
83 changes: 81 additions & 2 deletions demos/common/export_models/export_model.py
@@ -85,8 +85,55 @@ def add_common_arguments(parser):
parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt')
parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps')
parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps')

parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint')
add_common_arguments(parser_text2speech)
parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example, microsoft/speecht5_hifigan', dest='vocoder')

parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
add_common_arguments(parser_speech2text)
parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
args = vars(parser.parse_args())

tts_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "TtsExecutor"
  input_side_packet: "TTS_NODE_RESOURCES:tts_servable"
  calculator: "TtsCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.TtsCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

stt_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "SttExecutor"
  input_side_packet: "STT_NODE_RESOURCES:stt_servable"
  calculator: "SttCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.SttCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

embedding_graph_ov_template = """
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
@@ -457,7 +504,34 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting text2speech model to ", destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export text2speech model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(tts_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_speech2text_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting speech2text model to ", destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export speech2text model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(stt_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length):
    destination_path = os.path.join(model_repository_path, model_name)
@@ -585,14 +659,19 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
    export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'embeddings_ov':
    export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate'])
    export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters)

elif args['task'] == 'rerank':
    export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'rerank_ov':
    export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'text2speech':
    export_text2speech_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'speech2text':
    export_speech2text_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])
elif args['task'] == 'image_generation':
    template_parameters = {k: v for k, v in args.items() if k in [
        'ov_cache_dir',
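To illustrate how the graph templates added above resolve their defaults when rendered into graph.pbtxt, here is a small jinja2 sketch; the snippet string is only a stand-in for the plugin_config and device lines of tts_graph_template, not the full template:

```python
import jinja2

# Stand-in for the plugin_config and device lines of tts_graph_template.
snippet = (
    "plugin_config: '{ \"NUM_STREAMS\": \"{{num_streams|default(1, true)}}\" }',\n"
    'device: "{{target_device|default("CPU", true)}}"'
)

env = jinja2.Environment(loader=jinja2.BaseLoader)
# num_streams=0 is falsy, so default(1, true) falls back to 1; with target_device unset, CPU is used.
print(env.from_string(snippet).render(num_streams=0))
# Output:
# plugin_config: '{ "NUM_STREAMS": "1" }',
# device: "CPU"
```
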
3 changes: 3 additions & 0 deletions src/BUILD
@@ -558,6 +558,9 @@ ovms_cc_library(
"//conditions:default": [],
"//:not_disable_mediapipe" : [
"//src/image_gen:image_gen_calculator",
"//src/audio/speech_to_text:stt_calculator",
"//src/audio/text_to_speech:tts_calculator",
"//src/audio:audio_utils",
"//src/image_gen:imagegen_init",
"//src/llm:openai_completions_api_handler",
"//src/embeddings:embeddingscalculator_ov",
31 changes: 31 additions & 0 deletions src/audio/BUILD
@@ -0,0 +1,31 @@
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
load("//:common_settings.bzl", "ovms_cc_library")

ovms_cc_library(
    name = "audio_utils",
    hdrs = ["audio_utils.hpp"],
    srcs = ["audio_utils.cpp"],
    visibility = ["//visibility:public"],
    deps = [
        "//src:libovmslogging",
        "//src/port:dr_audio",
        "//src:libovmstimer",
    ],
    alwayslink = 1,
)