Skip to content

Commit 4b06972

Browse files
style
1 parent 4d58dc0 commit 4b06972

File tree

5 files changed

+13
-177
lines changed

5 files changed

+13
-177
lines changed

src/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,7 @@ ovms_cc_library(
560560
"//src/image_gen:image_gen_calculator",
561561
"//src/audio/speech_to_text:stt_calculator",
562562
"//src/audio/text_to_speech:tts_calculator",
563+
"//src/audio:audio_utils",
563564
"//src/image_gen:imagegen_init",
564565
"//src/llm:openai_completions_api_handler",
565566
"//src/embeddings:embeddingscalculator",

src/audio/speech_to_text/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ ovms_cc_library(
3535
"//src/port:dr_audio",
3636
":stt_servable",
3737
"//third_party:genai",
38-
"//src:libovmstimer",
38+
"//src/audio:audio_utils",
3939
],
4040
visibility = ["//visibility:public"],
4141
alwayslink = 1,

src/audio/speech_to_text/stt_calculator.cc

Lines changed: 1 addition & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#pragma GCC diagnostic pop
2525
#pragma warning(pop)
2626

27-
#include "src/timer.hpp"
27+
#include "src/audio/audio_utils.hpp"
2828
#include "src/http_payload.hpp"
2929
#include "src/logging.hpp"
3030
#include <mutex>
@@ -35,9 +35,6 @@
3535
#include "absl/strings/escaping.h"
3636
#include "absl/strings/str_cat.h"
3737
#pragma warning(pop)
38-
#define DR_WAV_IMPLEMENTATION
39-
#define DR_MP3_IMPLEMENTATION
40-
#include "src/port/dr_audio.hpp"
4138

4239
#include "stt_servable.hpp"
4340

@@ -52,145 +49,6 @@ namespace mediapipe {
5249

5350
const std::string STT_SESSION_SIDE_PACKET_TAG = "STT_NODE_RESOURCES";
5451

55-
#define PIPELINE_SUPPORTED_SAMPLE_RATE 16000
56-
57-
// Reports whether `buf` is exactly one complete RIFF/WAVE file image.
//
// The buffer qualifies when it begins with the 12-byte RIFF header ("RIFF",
// a 32-bit little-endian chunk size, "WAVE") and the declared chunk size plus
// the 8 header bytes that precede the chunk payload equals buf.size().
//
// @param buf  raw bytes to inspect; taken by reference so the payload is not copied
// @return true when the magic values match and the declared size agrees with buf.size()
//
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav_buffer(const std::string& buf) {
    SPDLOG_TRACE("is_wav_buffer: buf {}", buf.substr(0, 12));
    constexpr size_t riffHeaderSize = 12;
    if (buf.size() < riffHeaderSize || buf.compare(0, 4, "RIFF") != 0 || buf.compare(8, 4, "WAVE") != 0) {
        return false;
    }

    // Assemble the size byte by byte: the field is little-endian on disk and
    // is not guaranteed to be aligned for a direct uint32_t load.
    const unsigned char* sizeField = reinterpret_cast<const unsigned char*>(buf.data()) + 4;
    const uint32_t chunk_size = static_cast<uint32_t>(sizeField[0]) |
                                (static_cast<uint32_t>(sizeField[1]) << 8) |
                                (static_cast<uint32_t>(sizeField[2]) << 16) |
                                (static_cast<uint32_t>(sizeField[3]) << 24);
    SPDLOG_TRACE("is_wav_buffer: chunk_size {}", chunk_size);

    // Widen before adding so a chunk size close to UINT32_MAX cannot wrap.
    return static_cast<uint64_t>(chunk_size) + 8 == static_cast<uint64_t>(buf.size());
}
73-
// https://github.com/openvinotoolkit/openvino.genai/blob/8698683535fe32b5e3cb6953000c4e0175841bd3/samples/c/whisper_speech_recognition/whisper_utils.c#L105
74-
// Resamples a mono float signal from `input_rate` to `target_rate` using
// linear interpolation between neighbouring input samples.
//
// @param input          source samples; must hold `input_length` values
// @param input_length   number of source samples
// @param input_rate     sample rate of `input`
// @param target_rate    desired sample rate
// @param output_length  receives the number of samples in the returned buffer
// @return buffer obtained from malloc() that the caller must release with
//         free(); nullptr when the allocation fails or when there is nothing
//         to produce (empty input, non-positive rate, zero-length result), in
//         which case `*output_length` is 0 unless the allocation itself failed
float* resample_audio(const float* input,
    size_t input_length,
    float input_rate,
    float target_rate,
    size_t* output_length) {
    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Input file sample rate: {}. Resampling to {} required", input_rate, target_rate);
    *output_length = 0;
    // The negated comparisons also reject NaN rates.
    if (input == nullptr || input_length == 0 || !(input_rate > 0.0f) || !(target_rate > 0.0f)) {
        return nullptr;
    }

    // Positions are tracked in double: a float cannot represent sample
    // indices above 2^24 exactly, which distorts long recordings.
    const double ratio = static_cast<double>(input_rate) / static_cast<double>(target_rate);
    const double scaledLength = static_cast<double>(input_length) / ratio;
    const size_t maxSamples = static_cast<size_t>(-1) / sizeof(float);
    if (!(scaledLength < static_cast<double>(maxSamples))) {
        // The byte count would not fit in size_t.
        return nullptr;
    }
    const size_t sampleCount = static_cast<size_t>(scaledLength);
    if (sampleCount == 0) {
        return nullptr;
    }

    *output_length = sampleCount;
    float* output = static_cast<float*>(malloc(sampleCount * sizeof(float)));
    if (!output) {
        return nullptr;
    }

    const size_t lastIndex = input_length - 1;
    for (size_t i = 0; i < sampleCount; ++i) {
        const double position = static_cast<double>(i) * ratio;
        const size_t idx0 = static_cast<size_t>(position);
        if (idx0 >= lastIndex) {
            // No right-hand neighbour left to interpolate with.
            output[i] = input[lastIndex];
            continue;
        }
        const float frac = static_cast<float>(position - static_cast<double>(idx0));
        output[i] = input[idx0] * (1.0f - frac) + input[idx0 + 1] * frac;
    }

    return output;
}
103-
104-
// Slot indices for the Timer used by read_wav() and read_mp3().
enum : unsigned int {
105-
TENSOR_PREPARATION,  // decoding the input and converting it to float samples
106-
RESAMPLING,  // the resample_audio() call
107-
TIMER_END  // number of slots; used as the Timer template argument
108-
};
109-
110-
// Decodes an in-memory WAV file into mono float samples at
// PIPELINE_SUPPORTED_SAMPLE_RATE.
//
// Samples are decoded as 16-bit PCM, scaled by 1/32768, and stereo input is
// averaged into one channel. Input at any other sample rate is passed through
// resample_audio().
//
// @param wav_data  complete WAV file contents
// @return mono float samples at PIPELINE_SUPPORTED_SAMPLE_RATE
// @throws std::runtime_error when the data cannot be parsed, is neither mono
//         nor stereo, or the resampling buffer cannot be allocated
ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) {
    // Releases the decoder on every exit path, including exceptions.
    struct WavCloser {
        drwav* handle;
        ~WavCloser() { drwav_uninit(handle); }
    };
    // Owns the malloc()-allocated buffer returned by resample_audio().
    struct ResampledBuffer {
        float* samples;
        ~ResampledBuffer() { free(samples); }
    };

    Timer<TIMER_END> timer;
    timer.start(TENSOR_PREPARATION);

    std::vector<int16_t> pcm16;
    uint64_t frameCount = 0;
    unsigned int channels = 0;
    unsigned int sampleRate = 0;
    {
        drwav wav;
        if (!drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr)) {
            throw std::runtime_error("WAV file parsing failed");
        }
        WavCloser closer{&wav};
        if (wav.channels != 1 && wav.channels != 2) {
            throw std::runtime_error("WAV file must be mono or stereo");
        }
        channels = wav.channels;
        sampleRate = wav.sampleRate;

        // Computed in bits so streams below 8 bits per sample do not produce
        // a zero divisor.
        const uint64_t bitsPerFrame = static_cast<uint64_t>(wav.channels) * wav.bitsPerSample;
        if (bitsPerFrame == 0) {
            throw std::runtime_error("WAV file parsing failed");
        }
        // The header-declared frame count is capped by what the payload could
        // physically hold, so a forged header cannot force a huge allocation.
        const uint64_t payloadLimit = static_cast<uint64_t>(wav_data.size()) * 8 / bitsPerFrame;
        const uint64_t declared = wav.totalPCMFrameCount;
        const uint64_t framesToRead = declared < payloadLimit ? declared : payloadLimit;

        pcm16.resize(static_cast<size_t>(framesToRead * channels));
        // Keep only the frames actually decoded so no zero padding is appended.
        frameCount = drwav_read_pcm_frames_s16(&wav, framesToRead, pcm16.data());
    }

    // convert to mono, float
    std::vector<float> pcmf32(static_cast<size_t>(frameCount));
    if (channels == 1) {
        for (size_t i = 0; i < pcmf32.size(); ++i) {
            pcmf32[i] = static_cast<float>(pcm16[i]) / 32768.0f;
        }
    } else {
        for (size_t i = 0; i < pcmf32.size(); ++i) {
            // The int16 operands promote to int, so the sum cannot overflow.
            pcmf32[i] = static_cast<float>(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f;
        }
    }
    timer.stop(TENSOR_PREPARATION);
    auto tensorPreparationTime = (timer.elapsed<std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000;
    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Tensor preparation time: {} ms size: {}", tensorPreparationTime, pcmf32.size());
    if (sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
        return pcmf32;
    }

    size_t output_length = 0;
    timer.start(RESAMPLING);
    ResampledBuffer resampled{resample_audio(pcmf32.data(), pcmf32.size(), sampleRate, PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length)};
    timer.stop(RESAMPLING);
    auto resamplingTime = (timer.elapsed<std::chrono::microseconds>(RESAMPLING)) / 1000;
    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Resampling time: {} ms", resamplingTime);
    if (resampled.samples == nullptr) {
        if (output_length > 0) {
            throw std::runtime_error("Audio resampling failed: out of memory");
        }
        return {};
    }
    std::vector<float> output(resampled.samples, resampled.samples + output_length);
    return output;
}
159-
160-
// Decodes an in-memory MP3 file into mono float samples at
// PIPELINE_SUPPORTED_SAMPLE_RATE.
//
// The stream is decoded in fixed-size chunks until the decoder reports no
// more frames, so the result does not depend on a frame count being known up
// front. Stereo input is averaged into one channel. Input at any other sample
// rate is passed through resample_audio().
//
// @param mp3_data  complete MP3 file contents
// @return mono float samples at PIPELINE_SUPPORTED_SAMPLE_RATE
// @throws std::runtime_error when the data cannot be parsed, is neither mono
//         nor stereo, or the resampling buffer cannot be allocated
ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) {
    // Releases the decoder on every exit path, including exceptions.
    struct Mp3Closer {
        drmp3* handle;
        ~Mp3Closer() { drmp3_uninit(handle); }
    };
    // Owns the malloc()-allocated buffer returned by resample_audio().
    struct ResampledBuffer {
        float* samples;
        ~ResampledBuffer() { free(samples); }
    };

    Timer<TIMER_END> timer;
    timer.start(TENSOR_PREPARATION);

    std::vector<float> pcmf32;
    unsigned int sampleRate = 0;
    {
        drmp3 mp3;
        if (!drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr)) {
            throw std::runtime_error("MP3 file parsing failed");
        }
        Mp3Closer closer{&mp3};
        if (mp3.channels != 1 && mp3.channels != 2) {
            throw std::runtime_error("MP3 file must be mono or stereo");
        }
        const unsigned int channels = mp3.channels;
        sampleRate = mp3.sampleRate;

        constexpr uint64_t framesPerChunk = 4096;
        std::vector<float> chunk(static_cast<size_t>(framesPerChunk * channels));
        for (;;) {
            const uint64_t decoded = drmp3_read_pcm_frames_f32(&mp3, framesPerChunk, chunk.data());
            if (decoded == 0) {
                break;
            }
            const size_t frames = static_cast<size_t>(decoded);
            if (channels == 1) {
                pcmf32.insert(pcmf32.end(), chunk.begin(), chunk.begin() + frames);
            } else {
                // Average the interleaved left/right pair into one mono sample.
                for (size_t i = 0; i < frames; ++i) {
                    pcmf32.push_back((chunk[2 * i] + chunk[2 * i + 1]) * 0.5f);
                }
            }
        }
    }
    timer.stop(TENSOR_PREPARATION);
    auto tensorPreparationTime = (timer.elapsed<std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000;
    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Tensor preparation time: {} ms size: {}", tensorPreparationTime, pcmf32.size());
    if (sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
        return pcmf32;
    }

    timer.start(RESAMPLING);
    size_t output_length = 0;
    ResampledBuffer resampled{resample_audio(pcmf32.data(), pcmf32.size(), sampleRate, PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length)};
    timer.stop(RESAMPLING);
    auto resamplingTime = (timer.elapsed<std::chrono::microseconds>(RESAMPLING)) / 1000;
    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Resampling time: {} ms", resamplingTime);
    if (resampled.samples == nullptr) {
        if (output_length > 0) {
            throw std::runtime_error("Audio resampling failed: out of memory");
        }
        return {};
    }
    std::vector<float> output(resampled.samples, resampled.samples + output_length);
    return output;
}
193-
19452
class SttCalculator : public CalculatorBase {
19553
static const std::string INPUT_TAG_NAME;
19654
static const std::string OUTPUT_TAG_NAME;

src/audio/text_to_speech/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ ovms_cc_library(
3535
"//src/port:dr_audio",
3636
":tts_servable",
3737
"//third_party:genai",
38-
"//src:libovmstimer",
38+
"//src/audio:audio_utils",
3939
],
4040
visibility = ["//visibility:public"],
4141
alwayslink = 1,

src/audio/text_to_speech/tts_calculator.cc

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#pragma GCC diagnostic pop
2525
#pragma warning(pop)
2626

27-
#include "src/timer.hpp"
27+
#include "src/audio/audio_utils.hpp"
2828
#include "src/http_payload.hpp"
2929
#include "src/logging.hpp"
3030
#include <mutex>
@@ -105,40 +105,17 @@ class TtsCalculator : public CalculatorBase {
105105
return absl::InvalidArgumentError("streaming is not supported");
106106
}
107107
std::unique_lock lock(pipe->ttsPipelineMutex);
108-
auto gen_speech = pipe->ttsPipeline->generate(inputIt->value.GetString());
109-
110-
enum : unsigned int {
111-
OUTPUT_PREPARATION,
112-
TIMER_END
113-
};
114-
Timer<TIMER_END> timer;
115-
timer.start(OUTPUT_PREPARATION);
116-
drwav_data_format format;
117-
format.container = drwav_container_riff;
118-
format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
119-
format.channels = 1;
120-
format.sampleRate = 16000; // assume it is always 16 KHz
121-
format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth();
122-
drwav wav;
123-
void* ppData;
124-
size_t pDataSize;
125-
auto waveform_size = gen_speech.speeches[0].get_size();
126-
size_t total_samples = waveform_size * format.channels;
127-
ov::Tensor cpu_tensor(gen_speech.speeches[0].get_element_type(), gen_speech.speeches[0].get_shape());
108+
auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
109+
auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
110+
auto speechSize = generatedSpeech.speeches[0].get_size();
111+
ov::Tensor cpu_tensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
128112
// copy results to release inference request
129-
gen_speech.speeches[0].copy_to(cpu_tensor);
113+
generatedSpeech.speeches[0].copy_to(cpu_tensor);
130114
lock.unlock();
131-
132-
auto waveform_ptr = cpu_tensor.data<const float>();
133-
OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr),
134-
"Failed to initialize WAV writer");
135-
drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr);
136-
OPENVINO_ASSERT(frames_written == total_samples, "Failed to write all frames");
115+
void* ppData;
116+
size_t pDataSize;
117+
prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpu_tensor);
137118
output = std::make_unique<std::string>(reinterpret_cast<char*>(ppData), pDataSize);
138-
drwav_uninit(&wav);
139-
timer.stop(OUTPUT_PREPARATION);
140-
auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
141-
SPDLOG_LOGGER_DEBUG(tts_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
142119
// drwav_free(ppData, NULL); TODO: is needed?
143120
} else {
144121
return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri));

0 commit comments

Comments
 (0)