Skip to content

Commit 0d1cea9

Browse files
Transcript&speech endpoints (#3719)
### 🛠 Summary CVS-174567 CVS-174596 POC productization #3683 --------- Co-authored-by: Dariusz Trawinski <[email protected]>
1 parent 0aa9987 commit 0d1cea9

29 files changed

+1127
-30
lines changed

WORKSPACE

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,3 +636,19 @@ cc_library(
636636
)
637637
""",
638638
)
639+
640+
new_git_repository(
641+
name = "dr_libs",
642+
remote = "https://github.com/mackron/dr_libs",
643+
commit = "24d738be2349fd4b6fe50eeaa81f5bd586267fd0",
644+
build_file_content = """
645+
cc_library(
646+
name = "dr",
647+
hdrs = ["dr_flac.h", "dr_mp3.h", "dr_wav.h"],
648+
visibility = ["//visibility:public"],
649+
local_defines = [
650+
],
651+
)
652+
""",
653+
)
654+

demos/audio/README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Audio endpoints
2+
3+
4+
## Audio synthesis
5+
6+
python export_model.py text2speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan --weight-format fp16
7+
8+
docker run -p 8000:8000 -d -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000
9+
10+
curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.wav
11+
12+
13+
## Audio transcription
14+
15+
python export_model.py speech2text --source_model openai/whisper-large-v2 --weight-format fp16 --target_device GPU
16+
17+
18+
docker run -p 8000:8000 -it --device /dev/dri -u 0 -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000
19+
20+
21+
curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper"
22+
23+
24+
25+

demos/audio/openai_speech2text.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#
2+
# Copyright (c) 2025 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
from pathlib import Path
18+
from openai import OpenAI
19+
20+
filename = "speech.wav"
21+
url="http://localhost:8125/v3"
22+
23+
24+
speech_file_path = Path(__file__).parent / filename
25+
client = OpenAI(base_url=url, api_key="not_used")
26+
27+
audio_file = open(filename, "rb")
28+
transcript = client.audio.transcriptions.create(
29+
model="openai/whisper-large-v2",
30+
file=audio_file
31+
)
32+
33+
print(transcript)

demos/audio/openai_text2speech.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#
2+
# Copyright (c) 2025 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
from pathlib import Path
18+
from openai import OpenAI
19+
20+
prompt = "Intel Corporation is an American multinational technology company headquartered in Santa Clara, California.[3] Intel designs, manufactures, and sells computer components such as central processing units (CPUs) and related products for business and consumer markets. It was the world's third-largest semiconductor chip manufacturer by revenue in 2024[4] and has been included in the Fortune 500 list of the largest United States corporations by revenue since 2007. It was one of the first companies listed on Nasdaq. Since 2025, it is partially owned by the United States government."
21+
filename = "speech.wav"
22+
url="http://localhost:8125/v3"
23+
24+
25+
speech_file_path = Path(__file__).parent / "speech.wav"
26+
client = OpenAI(base_url=url, api_key="not_used")
27+
28+
with client.audio.speech.with_streaming_response.create(
29+
model="microsoft/speecht5_tts",
30+
voice="unused",
31+
input=prompt
32+
) as response:
33+
response.stream_to_file(speech_file_path)
34+
35+
36+
print("Generation finished")

demos/common/export_models/export_model.py

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,55 @@ def add_common_arguments(parser):
8585
parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt')
8686
parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps')
8787
parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps')
88+
89+
parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint')
90+
add_common_arguments(parser_text2speech)
91+
parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
92+
parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder')
93+
94+
parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
95+
add_common_arguments(parser_speech2text)
96+
parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
8897
args = vars(parser.parse_args())
8998

99+
tts_graph_template = """
100+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
101+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
102+
node {
103+
name: "TtsExecutor"
104+
input_side_packet: "TTS_NODE_RESOURCES:tts_servable"
105+
calculator: "TtsCalculator"
106+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
107+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
108+
node_options: {
109+
[type.googleapis.com / mediapipe.TtsCalculatorOptions]: {
110+
models_path: "{{model_path}}",
111+
plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
112+
device: "{{target_device|default("CPU", true)}}"
113+
}
114+
}
115+
}
116+
"""
117+
118+
stt_graph_template = """
119+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
120+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
121+
node {
122+
name: "SttExecutor"
123+
input_side_packet: "STT_NODE_RESOURCES:stt_servable"
124+
calculator: "SttCalculator"
125+
input_stream: "HTTP_REQUEST_PAYLOAD:input"
126+
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
127+
node_options: {
128+
[type.googleapis.com / mediapipe.SttCalculatorOptions]: {
129+
models_path: "{{model_path}}",
130+
plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
131+
device: "{{target_device|default("CPU", true)}}"
132+
}
133+
}
134+
}
135+
"""
136+
90137
embedding_graph_ov_template = """
91138
input_stream: "REQUEST_PAYLOAD:input"
92139
output_stream: "RESPONSE_PAYLOAD:output"
@@ -457,7 +504,34 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
457504
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
458505
f.write(graph_content)
459506
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
460-
add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
507+
508+
def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    """Export a text2speech model via optimum-cli and register it for serving.

    Args:
        model_repository_path: Root directory of the model repository.
        source_model: Source model id, e.g. "microsoft/speecht5_tts".
        model_name: Name the servable is registered under.
        precision: Weight format passed to optimum-cli (e.g. "fp16").
        task_parameters: Task arguments; must contain a 'vocoder' model id.
        config_file_path: Server config file the servable is added to.

    Raises:
        ValueError: if --vocoder is missing or the optimum-cli export fails.
    """
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting text2speech model to ", destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        # Fix: --vocoder has no default; without this check the command would
        # embed the literal string "None" as the vocoder model id.
        if not task_parameters.get('vocoder'):
            raise ValueError("text2speech export requires --vocoder, e.g. microsoft/speecht5_hifigan")
        # NOTE(review): command is built by string interpolation and executed via a
        # shell; arguments must come from trusted CLI input only.
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export text2speech model", source_model)
    # The graph is (re)generated unconditionally so template changes take effect
    # even when the exported weights already exist.
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(tts_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    graph_path = os.path.join(model_repository_path, model_name, 'graph.pbtxt')
    with open(graph_path, 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(graph_path))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
521+
522+
def export_speech2text_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    """Export a speech2text model via optimum-cli and register it for serving.

    Args:
        model_repository_path: Root directory of the model repository.
        source_model: Source model id, e.g. "openai/whisper-large-v2".
        model_name: Name the servable is registered under.
        precision: Weight format passed to optimum-cli (e.g. "fp16").
        task_parameters: Task arguments rendered into the serving graph.
        config_file_path: Server config file the servable is added to.

    Raises:
        ValueError: if the optimum-cli export command fails.
    """
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting speech2text model to ", destination_path)
    # Skip the (slow) export when weights already exist, unless overwriting.
    needs_export = args['overwrite_models'] or not os.path.isdir(destination_path)
    if needs_export:
        export_cmd = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(
            source_model, precision, destination_path)
        if os.system(export_cmd):
            raise ValueError("Failed to export speech2text model", source_model)
    # Always (re)render the serving graph so template changes take effect.
    environment = jinja2.Environment(loader=jinja2.BaseLoader)
    graph_content = environment.from_string(stt_graph_template).render(model_path="./", **task_parameters)
    graph_path = os.path.join(model_repository_path, model_name, 'graph.pbtxt')
    with open(graph_path, 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(graph_path))
    servable_path = os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))
    add_servable_to_config(config_file_path, model_name, servable_path)
461535

462536
def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length):
463537
destination_path = os.path.join(model_repository_path, model_name)
@@ -585,14 +659,19 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
585659
export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])
586660

587661
elif args['task'] == 'embeddings_ov':
588-
export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate'])
662+
export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters)
589663

590664
elif args['task'] == 'rerank':
591665
export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length'])
592666

593667
elif args['task'] == 'rerank_ov':
594668
export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'], args['max_doc_length'])
595669

670+
elif args['task'] == 'text2speech':
671+
export_text2speech_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])
672+
673+
elif args['task'] == 'speech2text':
674+
export_speech2text_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'])
596675
elif args['task'] == 'image_generation':
597676
template_parameters = {k: v for k, v in args.items() if k in [
598677
'ov_cache_dir',

src/BUILD

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,9 @@ ovms_cc_library(
558558
"//conditions:default": [],
559559
"//:not_disable_mediapipe" : [
560560
"//src/image_gen:image_gen_calculator",
561+
"//src/audio/speech_to_text:stt_calculator",
562+
"//src/audio/text_to_speech:tts_calculator",
563+
"//src/audio:audio_utils",
561564
"//src/image_gen:imagegen_init",
562565
"//src/llm:openai_completions_api_handler",
563566
"//src/embeddings:embeddingscalculator_ov",

src/audio/BUILD

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#
2+
# Copyright (c) 2025 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
18+
load("//:common_settings.bzl", "ovms_cc_library")
19+
20+
ovms_cc_library(
21+
name = "audio_utils",
22+
hdrs = ["audio_utils.hpp"],
23+
srcs = ["audio_utils.cpp"],
24+
visibility = ["//visibility:public"],
25+
deps = [
26+
"//src:libovmslogging",
27+
"//src/port:dr_audio",
28+
"//src:libovmstimer",
29+
],
30+
alwayslink = 1,
31+
)

0 commit comments

Comments
 (0)