@@ -104,6 +104,7 @@ struct whisper_params {
104
104
bool flash_attn = false ;
105
105
bool suppress_nst = false ;
106
106
bool no_context = false ;
107
+ bool no_language_probabilities = false ;
107
108
108
109
std::string language = " en" ;
109
110
std::string prompt = " " ;
@@ -178,6 +179,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
178
179
fprintf (stderr, " -nc, --no-context [%-7s] do not use previous audio context\n " , params.no_context ? " true" : " false" );
179
180
fprintf (stderr, " -ng, --no-gpu [%-7s] do not use gpu\n " , params.use_gpu ? " false" : " true" );
180
181
fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention\n " , params.flash_attn ? " true" : " false" );
182
+ fprintf (stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n " , params.no_language_probabilities ? " true" : " false" );
181
183
// Voice Activity Detection (VAD) parameters
182
184
fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
183
185
fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
@@ -237,6 +239,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
237
239
else if (arg == " -sns" || arg == " --suppress-nst" ) { params.suppress_nst = true ; }
238
240
else if (arg == " -nth" || arg == " --no-speech-thold" ) { params.no_speech_thold = std::stof (argv[++i]); }
239
241
else if (arg == " -nc" || arg == " --no-context" ) { params.no_context = true ; }
242
+ else if (arg == " -nlp" || arg == " --no-language-probabilities" ) { params.no_language_probabilities = true ; }
240
243
241
244
// server params
242
245
else if ( arg == " --port" ) { sparams.port = std::stoi (argv[++i]); }
@@ -599,6 +602,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
599
602
{
600
603
params.vad_samples_overlap = std::stof (req.get_file_value (" vad_samples_overlap" ).content );
601
604
}
605
+ if (req.has_file (" no_language_probabilities" ))
606
+ {
607
+ params.no_language_probabilities = parse_str_to_bool (req.get_file_value (" no_language_probabilities" ).content );
608
+ }
602
609
}
603
610
604
611
} // namespace
@@ -1024,23 +1031,25 @@ int main(int argc, char ** argv) {
1024
1031
} else if (params.response_format == vjson_format) {
1025
1032
/* try to match openai/whisper's Python format */
1026
1033
std::string results = output_str (ctx, params, pcmf32s);
1027
- // Get language probabilities
1028
- std::vector<float > lang_probs (whisper_lang_max_id () + 1 , 0 .0f );
1029
- const auto detected_lang_id = whisper_lang_auto_detect (ctx, 0 , params.n_threads , lang_probs.data ());
1030
1034
json jres = json{
1031
1035
{" task" , params.translate ? " translate" : " transcribe" },
1032
1036
{" language" , whisper_lang_str_full (whisper_full_lang_id (ctx))},
1033
1037
{" duration" , float (pcmf32.size ())/WHISPER_SAMPLE_RATE},
1034
1038
{" text" , results},
1035
- {" segments" , json::array ()},
1036
- {" detected_language" , whisper_lang_str_full (detected_lang_id)},
1037
- {" detected_language_probability" , lang_probs[detected_lang_id]},
1038
- {" language_probabilities" , json::object ()}
1039
+ {" segments" , json::array ()}
1039
1040
};
1040
- // Add all language probabilities
1041
- for (int i = 0 ; i <= whisper_lang_max_id (); ++i) {
1042
- if (lang_probs[i] > 0 .001f ) { // Only include non-negligible probabilities
1043
- jres[" language_probabilities" ][whisper_lang_str (i)] = lang_probs[i];
1041
+ // Only compute language probabilities if requested (expensive operation)
1042
+ if (!params.no_language_probabilities ) {
1043
+ std::vector<float > lang_probs (whisper_lang_max_id () + 1 , 0 .0f );
1044
+ const auto detected_lang_id = whisper_lang_auto_detect (ctx, 0 , params.n_threads , lang_probs.data ());
1045
+ jres[" detected_language" ] = whisper_lang_str_full (detected_lang_id);
1046
+ jres[" detected_language_probability" ] = lang_probs[detected_lang_id];
1047
+ jres[" language_probabilities" ] = json::object ();
1048
+ // Add all language probabilities
1049
+ for (int i = 0 ; i <= whisper_lang_max_id (); ++i) {
1050
+ if (lang_probs[i] > 0 .001f ) { // Only include non-negligible probabilities
1051
+ jres[" language_probabilities" ][whisper_lang_str (i)] = lang_probs[i];
1052
+ }
1044
1053
}
1045
1054
}
1046
1055
const int n_segments = whisper_full_n_segments (ctx);
0 commit comments