style

michalkulakowski · michalkulakowski · commit 4b0697233220 · 2025-10-24T16:18:36.000+02:00
diff --git a/src/BUILD b/src/BUILD
@@ -560,6 +560,7 @@ ovms_cc_library(
                 "//src/image_gen:image_gen_calculator",
                 "//src/audio/speech_to_text:stt_calculator",
                 "//src/audio/text_to_speech:tts_calculator",
+                "//src/audio:audio_utils",
                 "//src/image_gen:imagegen_init",
                 "//src/llm:openai_completions_api_handler",
                 "//src/embeddings:embeddingscalculator",
diff --git a/src/audio/speech_to_text/BUILD b/src/audio/speech_to_text/BUILD
@@ -35,7 +35,7 @@ ovms_cc_library(
         "//src/port:dr_audio",
         ":stt_servable",
         "//third_party:genai",
-        "//src:libovmstimer",
+        "//src/audio:audio_utils",
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
diff --git a/src/audio/speech_to_text/stt_calculator.cc b/src/audio/speech_to_text/stt_calculator.cc
@@ -24,7 +24,7 @@
 #pragma GCC diagnostic pop
 #pragma warning(pop)
 
-#include "src/timer.hpp"
+#include "src/audio/audio_utils.hpp"
 #include "src/http_payload.hpp"
 #include "src/logging.hpp"
 #include <mutex>
@@ -35,9 +35,6 @@
 #include "absl/strings/escaping.h"
 #include "absl/strings/str_cat.h"
 #pragma warning(pop)
-#define DR_WAV_IMPLEMENTATION
-#define DR_MP3_IMPLEMENTATION
-#include "src/port/dr_audio.hpp"
 
 #include "stt_servable.hpp"
 
@@ -52,145 +49,6 @@ namespace mediapipe {
 
 const std::string STT_SESSION_SIDE_PACKET_TAG = "STT_NODE_RESOURCES";
 
-#define PIPELINE_SUPPORTED_SAMPLE_RATE 16000
-
-bool is_wav_buffer(const std::string buf) {
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    SPDLOG_TRACE("is_wav_buffer: buf {}", buf.substr(0, 12));
-    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
-        return false;
-    }
-
-    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
-    SPDLOG_TRACE("is_wav_buffer: chunk_size {}", chunk_size);
-    if (chunk_size + 8 != buf.size()) {
-        return false;
-    }
-
-    return true;
-}
-// https://github.com/openvinotoolkit/openvino.genai/blob/8698683535fe32b5e3cb6953000c4e0175841bd3/samples/c/whisper_speech_recognition/whisper_utils.c#L105
-float* resample_audio(const float* input,
-    size_t input_length,
-    float input_rate,
-    float target_rate,
-    size_t* output_length) {
-    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Input file sample rate: {}. Resampling to {} required", input_rate, target_rate);
-    float ratio = input_rate / target_rate;
-    *output_length = (size_t)(input_length / ratio);
-    float* output = (float*)malloc(*output_length * sizeof(float));
-
-    if (!output) {
-        return NULL;
-    }
-
-    for (size_t i = 0; i < *output_length; i++) {
-        float src_idx = i * ratio;
-        size_t idx0 = (size_t)src_idx;
-        size_t idx1 = idx0 + 1;
-
-        if (idx1 >= input_length) {
-            output[i] = input[input_length - 1];
-        } else {
-            float frac = src_idx - idx0;
-            output[i] = input[idx0] * (1.0f - frac) + input[idx1] * frac;
-        }
-    }
-
-    return output;
-}
-
-enum : unsigned int {
-    TENSOR_PREPARATION,
-    RESAMPLING,
-    TIMER_END
-};
-
-ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) {
-    Timer<TIMER_END> timer;
-    timer.start(TENSOR_PREPARATION);
-    drwav wav;
-    auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr);
-    if (result == false) {
-        throw std::runtime_error("WAV file parsing failed");
-    }
-    if (wav.channels != 1 && wav.channels != 2) {
-        drwav_uninit(&wav);
-        throw std::runtime_error("WAV file must be mono or stereo");
-    }
-
-    const uint64_t n =
-        wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() / (wav.channels * wav.bitsPerSample / 8ul);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n * wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    std::vector<float> pcmf32;
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i]) / 32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f;
-        }
-    }
-    timer.stop(TENSOR_PREPARATION);
-    auto tensorPreparationTime = (timer.elapsed<std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000;
-    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Tensor preparation time: {} ms size: {}", tensorPreparationTime, pcmf32.size());
-    if (wav.sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
-        return pcmf32;
-    }
-
-    size_t output_length;
-    timer.start(RESAMPLING);
-    auto buffer = resample_audio(reinterpret_cast<float*>(pcmf32.data()), pcmf32.size(), wav.sampleRate, PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length);
-    timer.stop(RESAMPLING);
-    auto resamplingTime = (timer.elapsed<std::chrono::microseconds>(RESAMPLING)) / 1000;
-    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Resampling time: {} ms", resamplingTime);
-    std::vector<float> output(buffer, buffer + output_length);
-    return output;
-}
-
-ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) {
-    Timer<TIMER_END> timer;
-    timer.start(TENSOR_PREPARATION);
-    drmp3 mp3;
-    auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr);
-    if (result == 0) {
-        throw std::runtime_error("MP3 file parsing failed");
-    }
-
-    if (mp3.channels != 1 && mp3.channels != 2) {
-        drmp3_uninit(&mp3);
-        throw std::runtime_error("MP3 file must be mono or stereo");
-    }
-    const uint64_t n = mp3.totalPCMFrameCount;
-    std::vector<float> pcmf32;
-    pcmf32.resize(n * mp3.channels);
-    drmp3_read_pcm_frames_f32(&mp3, n, pcmf32.data());
-    drmp3_uninit(&mp3);
-    timer.stop(TENSOR_PREPARATION);
-    auto tensorPreparationTime = (timer.elapsed<std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000;
-    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Tensor preparation time: {} ms size: {}", tensorPreparationTime, pcmf32.size());
-    if (mp3.sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
-        return pcmf32;
-    }
-    timer.start(RESAMPLING);
-    size_t output_length;
-    auto buffer = resample_audio(reinterpret_cast<float*>(pcmf32.data()), pcmf32.size(), mp3.sampleRate, PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length);
-    timer.stop(RESAMPLING);
-    auto resamplingTime = (timer.elapsed<std::chrono::microseconds>(RESAMPLING)) / 1000;
-    SPDLOG_LOGGER_DEBUG(stt_calculator_logger, "Resampling time: {} ms", resamplingTime);
-    std::vector<float> output(buffer, buffer + output_length);
-    return output;
-}
-
 class SttCalculator : public CalculatorBase {
     static const std::string INPUT_TAG_NAME;
     static const std::string OUTPUT_TAG_NAME;
diff --git a/src/audio/text_to_speech/BUILD b/src/audio/text_to_speech/BUILD
@@ -35,7 +35,7 @@ ovms_cc_library(
         "//src/port:dr_audio",
         ":tts_servable",
         "//third_party:genai",
-         "//src:libovmstimer",
+        "//src/audio:audio_utils",
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
diff --git a/src/audio/text_to_speech/tts_calculator.cc b/src/audio/text_to_speech/tts_calculator.cc
@@ -24,7 +24,7 @@
 #pragma GCC diagnostic pop
 #pragma warning(pop)
 
-#include "src/timer.hpp"
+#include "src/audio/audio_utils.hpp"
 #include "src/http_payload.hpp"
 #include "src/logging.hpp"
 #include <mutex>
@@ -105,40 +105,17 @@ class TtsCalculator : public CalculatorBase {
                 return absl::InvalidArgumentError("streaming is not supported");
             }
             std::unique_lock lock(pipe->ttsPipelineMutex);
-            auto gen_speech = pipe->ttsPipeline->generate(inputIt->value.GetString());
-
-            enum : unsigned int {
-                OUTPUT_PREPARATION,
-                TIMER_END
-            };
-            Timer<TIMER_END> timer;
-            timer.start(OUTPUT_PREPARATION);
-            drwav_data_format format;
-            format.container = drwav_container_riff;
-            format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
-            format.channels = 1;
-            format.sampleRate = 16000;  // assume it is always 16 KHz
-            format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth();
-            drwav wav;
-            void* ppData;
-            size_t pDataSize;
-            auto waveform_size = gen_speech.speeches[0].get_size();
-            size_t total_samples = waveform_size * format.channels;
-            ov::Tensor cpu_tensor(gen_speech.speeches[0].get_element_type(), gen_speech.speeches[0].get_shape());
+            auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
+            auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
+            auto speechSize = generatedSpeech.speeches[0].get_size();
+            ov::Tensor cpu_tensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
             // copy results to release inference request
-            gen_speech.speeches[0].copy_to(cpu_tensor);
+            generatedSpeech.speeches[0].copy_to(cpu_tensor);
             lock.unlock();
-
-            auto waveform_ptr = cpu_tensor.data<const float>();
-            OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr),
-                "Failed to initialize WAV writer");
-            drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr);
-            OPENVINO_ASSERT(frames_written == total_samples, "Failed to write all frames");
+            void* ppData;
+            size_t pDataSize;
+            prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpu_tensor);
             output = std::make_unique<std::string>(reinterpret_cast<char*>(ppData), pDataSize);
-            drwav_uninit(&wav);
-            timer.stop(OUTPUT_PREPARATION);
-            auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
-            SPDLOG_LOGGER_DEBUG(tts_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
             // drwav_free(ppData, NULL); TODO: is needed?
         } else {
             return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri));