FluidInference · BrandonWeng · Jul 11, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 11, 2025
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
diff --git a/samples/c/whisper_speech_recognition/CMakeLists.txt b/samples/c/whisper_speech_recognition/CMakeLists.txt
@@ -9,9 +9,9 @@ find_package(OpenVINOGenAI REQUIRED
 )
 
 # Whisper Speech Recognition Sample
-add_executable(whisper_speech_recognition_c whisper_speech_recognition.c)
+add_executable(whisper_speech_recognition_c whisper_speech_recognition.c whisper_utils.c)
 # Specifies that the source file should be compiled as a C source file
-set_source_files_properties(whisper_speech_recognition.c PROPERTIES LANGUAGE C)
+set_source_files_properties(whisper_speech_recognition.c whisper_utils.c PROPERTIES LANGUAGE C)
 target_link_libraries(whisper_speech_recognition_c PRIVATE openvino::genai::c m)
 set_target_properties(whisper_speech_recognition_c PROPERTIES
     # Ensure out-of-box LC_RPATH on macOS with SIP
@@ -21,4 +21,4 @@ set_target_properties(whisper_speech_recognition_c PROPERTIES
 install(TARGETS whisper_speech_recognition_c
         RUNTIME DESTINATION samples_bin/c/whisper_speech_recognition
         COMPONENT samples_bin
-        EXCLUDE_FROM_ALL)
+        EXCLUDE_FROM_ALL)
diff --git a/samples/c/whisper_speech_recognition/whisper_speech_recognition.c b/samples/c/whisper_speech_recognition/whisper_speech_recognition.c
@@ -4,317 +4,10 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 #include <time.h>
-#include <errno.h>
 
 #include "openvino/genai/c/whisper_pipeline.h"
-
-#define MAX_PATH_LENGTH 1024
-#define CHECK_STATUS(return_status)                                                      \
-    if (return_status != OK) {                                                           \
-        const char* error_msg = "Unknown error";                                         \
-        switch(return_status) {                                                          \
-            case INVALID_C_PARAM: error_msg = "Invalid parameter"; break;                \
-            case NOT_FOUND: error_msg = "Not found"; break;                             \
-            case OUT_OF_BOUNDS: error_msg = "Out of bounds"; break;                     \
-            case UNEXPECTED: error_msg = "Unexpected error"; break;                     \
-            case NOT_IMPLEMENTED: error_msg = "Not implemented"; break;                  \
-            case UNKNOW_EXCEPTION: error_msg = "Unknown exception"; break;              \
-        }                                                                                \
-        fprintf(stderr, "[ERROR] %s (status code: %d) at line %d\n",                    \
-                error_msg, return_status, __LINE__);                                     \
-        exit_code = EXIT_FAILURE;                                                       \
-        goto err;                                                                        \
-    }
-
-// Default values
-#define DEFAULT_DEVICE         "CPU"
-#define DEFAULT_LANGUAGE       ""
-#define DEFAULT_TASK           "transcribe"
-#define DEFAULT_SAMPLE_RATE    16000.0f
-#define DEFAULT_DURATION       2.0f
-
-typedef struct {
-    const char* model_path;
-    const char* audio_path;
-    const char* device;
-    const char* language;
-    const char* task;
-    const char* initial_prompt;
-    bool return_timestamps;
-    bool use_synthetic_audio;
-    float sample_rate;
-    float duration;
-} Options;
-
-void print_usage(const char* program_name) {
-    printf("Usage: %s [OPTIONS]\n", program_name);
-    printf("\nRequired:\n");
-    printf("  -m, --model            Path to Whisper model directory\n");
-    printf("\nOptional:\n");
-    printf("  -i, --input            Path to audio file (WAV format). If not specified, uses synthetic audio\n");
-    printf("  -d, --device           Device to run inference on (default: %s)\n", DEFAULT_DEVICE);
-    printf("  -l, --language         Language code (e.g., 'en', 'fr', 'de'). Empty for auto-detect (default: auto-detect)\n");
-    printf("  -t, --task             Task: 'transcribe' or 'translate' (default: %s)\n", DEFAULT_TASK);
-    printf("  --initial_prompt       Initial prompt to guide transcription\n");
-    printf("  --timestamps           Return timestamps for each segment\n");
-    printf("  -h, --help             Print this help message\n");
-    printf("\nSynthetic audio options (when no input file specified):\n");
-    printf("  --duration             Duration of synthetic audio in seconds (default: %.1f)\n", DEFAULT_DURATION);
-    printf("\nExamples:\n");
-    printf("  # Transcribe an audio file\n");
-    printf("  %s -m /path/to/whisper/model -i audio.wav\n", program_name);
-    printf("\n  # Translate French audio to English\n");
-    printf("  %s -m /path/to/whisper/model -i french_audio.wav -l fr -t translate\n", program_name);
-    printf("\n  # Use synthetic audio\n");
-    printf("  %s -m /path/to/whisper/model\n", program_name);
-}
-
-int parse_arguments(int argc, char* argv[], Options* options) {
-    // Initialize with defaults
-    options->model_path = NULL;
-    options->audio_path = NULL;
-    options->device = DEFAULT_DEVICE;
-    options->language = DEFAULT_LANGUAGE;
-    options->task = DEFAULT_TASK;
-    options->initial_prompt = NULL;
-    options->return_timestamps = false;
-    options->use_synthetic_audio = true;
-    options->sample_rate = DEFAULT_SAMPLE_RATE;
-    options->duration = DEFAULT_DURATION;
-
-    for (int i = 1; i < argc; i++) {
-        if (strcmp(argv[i], "-m") == 0 || strcmp(argv[i], "--model") == 0) {
-            if (i + 1 < argc) {
-                options->model_path = argv[++i];
-            } else {
-                fprintf(stderr, "Error: --model requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "-i") == 0 || strcmp(argv[i], "--input") == 0) {
-            if (i + 1 < argc) {
-                options->audio_path = argv[++i];
-                options->use_synthetic_audio = false;
-            } else {
-                fprintf(stderr, "Error: --input requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "-d") == 0 || strcmp(argv[i], "--device") == 0) {
-            if (i + 1 < argc) {
-                options->device = argv[++i];
-            } else {
-                fprintf(stderr, "Error: --device requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "-l") == 0 || strcmp(argv[i], "--language") == 0) {
-            if (i + 1 < argc) {
-                options->language = argv[++i];
-            } else {
-                fprintf(stderr, "Error: --language requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--task") == 0) {
-            if (i + 1 < argc) {
-                options->task = argv[++i];
-                if (strcmp(options->task, "transcribe") != 0 && strcmp(options->task, "translate") != 0) {
-                    fprintf(stderr, "Error: --task must be 'transcribe' or 'translate'\n");
-                    return -1;
-                }
-            } else {
-                fprintf(stderr, "Error: --task requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "--initial_prompt") == 0) {
-            if (i + 1 < argc) {
-                options->initial_prompt = argv[++i];
-            } else {
-                fprintf(stderr, "Error: --initial_prompt requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "--timestamps") == 0) {
-            options->return_timestamps = true;
-        } else if (strcmp(argv[i], "--duration") == 0) {
-            if (i + 1 < argc) {
-                options->duration = (float)atof(argv[++i]);
-            } else {
-                fprintf(stderr, "Error: --duration requires an argument\n");
-                return -1;
-            }
-        } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
-            print_usage(argv[0]);
-            return 0;
-        } else {
-            fprintf(stderr, "Error: Unknown option %s\n", argv[i]);
-            fprintf(stderr, "Use -h or --help for usage information\n");
-            return -1;
-        }
-    }
-
-    // Validate required arguments
-    if (options->model_path == NULL) {
-        fprintf(stderr, "Error: Model path is required. Use -m or --model option\n");
-        fprintf(stderr, "Use -h or --help for usage information\n");
-        return -1;
-    }
-
-    return 1;
-}
-
-// Simple WAV file header structure
-typedef struct {
-    char chunk_id[4];
-    uint32_t chunk_size;
-    char format[4];
-    char subchunk1_id[4];
-    uint32_t subchunk1_size;
-    uint16_t audio_format;
-    uint16_t num_channels;
-    uint32_t sample_rate;
-    uint32_t byte_rate;
-    uint16_t block_align;
-    uint16_t bits_per_sample;
-    char subchunk2_id[4];
-    uint32_t subchunk2_size;
-} WAVHeader;
-
-// Load audio from WAV file
-int load_wav_file(const char* filename, float** audio_data, size_t* audio_length, float* sample_rate) {
-    FILE* file = fopen(filename, "rb");
-    if (!file) {
-        fprintf(stderr, "Error: Cannot open audio file '%s'. ", filename);
-        if (errno == ENOENT) {
-            fprintf(stderr, "File does not exist.\n");
-        } else if (errno == EACCES) {
-            fprintf(stderr, "Permission denied.\n");
-        } else {
-            fprintf(stderr, "Error code: %d\n", errno);
-        }
-        return -1;
-    }
-
-    WAVHeader header;
-    if (fread(&header, sizeof(WAVHeader), 1, file) != 1) {
-        fprintf(stderr, "Error: Cannot read WAV header\n");
-        fclose(file);
-        return -1;
-    }
-
-    // Basic WAV validation
-    if (strncmp(header.chunk_id, "RIFF", 4) != 0 || strncmp(header.format, "WAVE", 4) != 0) {
-        fprintf(stderr, "Error: Invalid WAV file format\n");
-        fclose(file);
-        return -1;
-    }
-
-    if (header.audio_format != 1) { // PCM
-        fprintf(stderr, "Error: Only PCM WAV files are supported\n");
-        fclose(file);
-        return -1;
-    }
-
-    if (header.num_channels != 1) {
-        fprintf(stderr, "Error: Only mono audio is supported (found %d channels)\n", header.num_channels);
-        fclose(file);
-        return -1;
-    }
-
-    *sample_rate = (float)header.sample_rate;
-    size_t num_samples = header.subchunk2_size / (header.bits_per_sample / 8);
-    *audio_length = num_samples;
-
-    // Allocate memory for audio data
-    *audio_data = (float*)malloc(num_samples * sizeof(float));
-    if (!*audio_data) {
-        fprintf(stderr, "Error: Cannot allocate memory for audio data\n");
-        fclose(file);
-        return -1;
-    }
-
-    // Read and convert audio data to float
-    if (header.bits_per_sample == 16) {
-        int16_t* temp_buffer = (int16_t*)malloc(num_samples * sizeof(int16_t));
-        if (!temp_buffer) {
-            fprintf(stderr, "Error: Cannot allocate temporary buffer\n");
-            free(*audio_data);
-            fclose(file);
-            return -1;
-        }
-
-        if (fread(temp_buffer, sizeof(int16_t), num_samples, file) != num_samples) {
-            fprintf(stderr, "Error: Cannot read audio data\n");
-            free(temp_buffer);
-            free(*audio_data);
-            fclose(file);
-            return -1;
-        }
-
-        // Convert 16-bit PCM to float [-1, 1]
-        for (size_t i = 0; i < num_samples; i++) {
-            (*audio_data)[i] = temp_buffer[i] / 32768.0f;
-        }
-
-        free(temp_buffer);
-    } else if (header.bits_per_sample == 32) {
-        if (fread(*audio_data, sizeof(float), num_samples, file) != num_samples) {
-            fprintf(stderr, "Error: Cannot read audio data\n");
-            free(*audio_data);
-            fclose(file);
-            return -1;
-        }
-    } else {
-        fprintf(stderr, "Error: Unsupported bit depth: %d\n", header.bits_per_sample);
-        free(*audio_data);
-        fclose(file);
-        return -1;
-    }
-
-    fclose(file);
-    return 0;
-}
-
-// Generate synthetic audio (sine wave)
-void generate_synthetic_audio(float* audio, size_t length, float frequency, float sample_rate) {
-    for (size_t i = 0; i < length; i++) {
-        audio[i] = 0.5f * sinf(2.0f * M_PI * frequency * (float)i / sample_rate);
-    }
-}
-
-// Resample audio to 16kHz if needed (simple linear interpolation)
-float* resample_audio(const float* input, size_t input_length, float input_rate, float target_rate, size_t* output_length) {
-    if (input_rate == target_rate) {
-        *output_length = input_length;
-        float* output = (float*)malloc(input_length * sizeof(float));
-        if (output) {
-            memcpy(output, input, input_length * sizeof(float));
-        }
-        return output;
-    }
-
-    float ratio = input_rate / target_rate;
-    *output_length = (size_t)(input_length / ratio);
-    float* output = (float*)malloc(*output_length * sizeof(float));
-
-    if (!output) {
-        return NULL;
-    }
-
-    for (size_t i = 0; i < *output_length; i++) {
-        float src_idx = i * ratio;
-        size_t idx0 = (size_t)src_idx;
-        size_t idx1 = idx0 + 1;
-
-        if (idx1 >= input_length) {
-            output[i] = input[input_length - 1];
-        } else {
-            float frac = src_idx - idx0;
-            output[i] = input[idx0] * (1.0f - frac) + input[idx1] * frac;
-        }
-    }
-
-    return output;
-}
-
+#include "whisper_utils.h"
 
 int main(int argc, char* argv[]) {
     Options options;