2424#pragma GCC diagnostic pop
2525#pragma warning(pop)
2626
27- #include " src/timer .hpp"
27+ #include " src/audio/audio_utils .hpp"
2828#include " src/http_payload.hpp"
2929#include " src/logging.hpp"
3030#include < mutex>
3535#include " absl/strings/escaping.h"
3636#include " absl/strings/str_cat.h"
3737#pragma warning(pop)
38- #define DR_WAV_IMPLEMENTATION
39- #define DR_MP3_IMPLEMENTATION
40- #include " src/port/dr_audio.hpp"
4138
4239#include " stt_servable.hpp"
4340
@@ -52,145 +49,6 @@ namespace mediapipe {
5249
// Tag string for the STT node resources; presumably used to look up the
// calculator's input side packet — confirm against the calculator contract.
const std::string STT_SESSION_SIDE_PACKET_TAG{"STT_NODE_RESOURCES"};
5451
// Sample rate that decoded audio is compared against and resampled to by the
// audio readers in this file. A typed constant is used instead of a macro so
// the name obeys scoping rules and is visible to the debugger.
constexpr unsigned int PIPELINE_SUPPORTED_SAMPLE_RATE = 16000;
56-
57- bool is_wav_buffer (const std::string buf) {
58- // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
59- // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
60- SPDLOG_TRACE (" is_wav_buffer: buf {}" , buf.substr (0 , 12 ));
61- if (buf.size () < 12 || buf.substr (0 , 4 ) != " RIFF" || buf.substr (8 , 4 ) != " WAVE" ) {
62- return false ;
63- }
64-
65- uint32_t chunk_size = *reinterpret_cast <const uint32_t *>(buf.data () + 4 );
66- SPDLOG_TRACE (" is_wav_buffer: chunk_size {}" , chunk_size);
67- if (chunk_size + 8 != buf.size ()) {
68- return false ;
69- }
70-
71- return true ;
72- }
73- // https://github.com/openvinotoolkit/openvino.genai/blob/8698683535fe32b5e3cb6953000c4e0175841bd3/samples/c/whisper_speech_recognition/whisper_utils.c#L105
74- float * resample_audio (const float * input,
75- size_t input_length,
76- float input_rate,
77- float target_rate,
78- size_t * output_length) {
79- SPDLOG_LOGGER_DEBUG (stt_calculator_logger, " Input file sample rate: {}. Resampling to {} required" , input_rate, target_rate);
80- float ratio = input_rate / target_rate;
81- *output_length = (size_t )(input_length / ratio);
82- float * output = (float *)malloc (*output_length * sizeof (float ));
83-
84- if (!output) {
85- return NULL ;
86- }
87-
88- for (size_t i = 0 ; i < *output_length; i++) {
89- float src_idx = i * ratio;
90- size_t idx0 = (size_t )src_idx;
91- size_t idx1 = idx0 + 1 ;
92-
93- if (idx1 >= input_length) {
94- output[i] = input[input_length - 1 ];
95- } else {
96- float frac = src_idx - idx0;
97- output[i] = input[idx0] * (1 .0f - frac) + input[idx1] * frac;
98- }
99- }
100-
101- return output;
102- }
103-
// Slot indices for the Timer<TIMER_END> instances used by read_wav and read_mp3.
// TIMER_END is not a slot itself; it is the slot count passed as the template argument.
enum : unsigned int {
    TENSOR_PREPARATION = 0u,
    RESAMPLING = 1u,
    TIMER_END = 2u
};
109-
110- ov::genai::RawSpeechInput read_wav (const std::string_view& wav_data) {
111- Timer<TIMER_END> timer;
112- timer.start (TENSOR_PREPARATION);
113- drwav wav;
114- auto result = drwav_init_memory (&wav, wav_data.data (), wav_data.size (), nullptr );
115- if (result == false ) {
116- throw std::runtime_error (" WAV file parsing failed" );
117- }
118- if (wav.channels != 1 && wav.channels != 2 ) {
119- drwav_uninit (&wav);
120- throw std::runtime_error (" WAV file must be mono or stereo" );
121- }
122-
123- const uint64_t n =
124- wav_data.empty () ? wav.totalPCMFrameCount : wav_data.size () / (wav.channels * wav.bitsPerSample / 8ul );
125-
126- std::vector<int16_t > pcm16;
127- pcm16.resize (n * wav.channels );
128- drwav_read_pcm_frames_s16 (&wav, n, pcm16.data ());
129- drwav_uninit (&wav);
130-
131- // convert to mono, float
132- std::vector<float > pcmf32;
133- pcmf32.resize (n);
134- if (wav.channels == 1 ) {
135- for (uint64_t i = 0 ; i < n; i++) {
136- pcmf32[i] = float (pcm16[i]) / 32768 .0f ;
137- }
138- } else {
139- for (uint64_t i = 0 ; i < n; i++) {
140- pcmf32[i] = float (pcm16[2 * i] + pcm16[2 * i + 1 ]) / 65536 .0f ;
141- }
142- }
143- timer.stop (TENSOR_PREPARATION);
144- auto tensorPreparationTime = (timer.elapsed <std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000 ;
145- SPDLOG_LOGGER_DEBUG (stt_calculator_logger, " Tensor preparation time: {} ms size: {}" , tensorPreparationTime, pcmf32.size ());
146- if (wav.sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
147- return pcmf32;
148- }
149-
150- size_t output_length;
151- timer.start (RESAMPLING);
152- auto buffer = resample_audio (reinterpret_cast <float *>(pcmf32.data ()), pcmf32.size (), wav.sampleRate , PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length);
153- timer.stop (RESAMPLING);
154- auto resamplingTime = (timer.elapsed <std::chrono::microseconds>(RESAMPLING)) / 1000 ;
155- SPDLOG_LOGGER_DEBUG (stt_calculator_logger, " Resampling time: {} ms" , resamplingTime);
156- std::vector<float > output (buffer, buffer + output_length);
157- return output;
158- }
159-
160- ov::genai::RawSpeechInput read_mp3 (const std::string_view& mp3_data) {
161- Timer<TIMER_END> timer;
162- timer.start (TENSOR_PREPARATION);
163- drmp3 mp3;
164- auto result = drmp3_init_memory (&mp3, mp3_data.data (), mp3_data.size (), nullptr );
165- if (result == 0 ) {
166- throw std::runtime_error (" MP3 file parsing failed" );
167- }
168-
169- if (mp3.channels != 1 && mp3.channels != 2 ) {
170- drmp3_uninit (&mp3);
171- throw std::runtime_error (" MP3 file must be mono or stereo" );
172- }
173- const uint64_t n = mp3.totalPCMFrameCount ;
174- std::vector<float > pcmf32;
175- pcmf32.resize (n * mp3.channels );
176- drmp3_read_pcm_frames_f32 (&mp3, n, pcmf32.data ());
177- drmp3_uninit (&mp3);
178- timer.stop (TENSOR_PREPARATION);
179- auto tensorPreparationTime = (timer.elapsed <std::chrono::microseconds>(TENSOR_PREPARATION)) / 1000 ;
180- SPDLOG_LOGGER_DEBUG (stt_calculator_logger, " Tensor preparation time: {} ms size: {}" , tensorPreparationTime, pcmf32.size ());
181- if (mp3.sampleRate == PIPELINE_SUPPORTED_SAMPLE_RATE) {
182- return pcmf32;
183- }
184- timer.start (RESAMPLING);
185- size_t output_length;
186- auto buffer = resample_audio (reinterpret_cast <float *>(pcmf32.data ()), pcmf32.size (), mp3.sampleRate , PIPELINE_SUPPORTED_SAMPLE_RATE, &output_length);
187- timer.stop (RESAMPLING);
188- auto resamplingTime = (timer.elapsed <std::chrono::microseconds>(RESAMPLING)) / 1000 ;
189- SPDLOG_LOGGER_DEBUG (stt_calculator_logger, " Resampling time: {} ms" , resamplingTime);
190- std::vector<float > output (buffer, buffer + output_length);
191- return output;
192- }
193-
19452class SttCalculator : public CalculatorBase {
19553 static const std::string INPUT_TAG_NAME;
19654 static const std::string OUTPUT_TAG_NAME;
0 commit comments