@@ -37,10 +37,21 @@ struct whisper_params {
37
37
bool save_audio = false ; // save audio to wav file
38
38
bool use_gpu = true ;
39
39
bool flash_attn = false ;
40
+ bool no_prints = false ;
40
41
41
42
std::string language = " en" ;
42
43
std::string model = " models/ggml-base.en.bin" ;
43
44
std::string fname_out;
45
+
46
+ // Voice Activity Detection (VAD) parameters
47
+ bool vad = false ;
48
+ std::string vad_model = " models/for-tests-silero-v5.1.2-ggml.bin" ;
49
+ float vad_threshold = 0 .5f ;
50
+ int vad_min_speech_duration_ms = 250 ;
51
+ int vad_min_silence_duration_ms = 100 ;
52
+ float vad_max_speech_duration_s = FLT_MAX;
53
+ int vad_speech_pad_ms = 30 ;
54
+ float vad_samples_overlap = 0 .1f ;
44
55
};
45
56
46
57
void whisper_print_usage (int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
61
72
else if (arg == " -mt" || arg == " --max-tokens" ) { params.max_tokens = std::stoi (argv[++i]); }
62
73
else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio_ctx = std::stoi (argv[++i]); }
63
74
else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
64
- else if (arg == " -vth" || arg == " --vad-thold" ) { params.vad_thold = std::stof (argv[++i]); }
65
- else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
66
75
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
67
76
else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
68
77
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
74
83
else if (arg == " -sa" || arg == " --save-audio" ) { params.save_audio = true ; }
75
84
else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
76
85
else if (arg == " -fa" || arg == " --flash-attn" ) { params.flash_attn = true ; }
77
-
86
+ else if (arg == " -np" || arg == " --no-prints" ) { params.no_prints = true ; }
87
+ // Voice Activity Detection (VAD)
88
+ else if ( arg == " --vad" ) { params.vad = true ; }
89
+ else if (arg == " -vm" || arg == " --vad-model" ) { params.vad_model = argv[++i]; }
90
+ else if (arg == " -vt" || arg == " --vad-threshold" ) { params.vad_threshold = std::stof (argv[++i]); }
91
+ else if (arg == " -vspd" || arg == " --vad-min-speech-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); } // short flag matches the usage text (-vspd)
92
+ else if (arg == " -vsd" || arg == " --vad-min-silence-duration-ms" ) { params.vad_min_silence_duration_ms = std::stoi (argv[++i]); } // must set the *silence* duration, not the speech duration
93
+ else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
94
+ else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
95
+ else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
78
96
else {
79
97
fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
80
98
whisper_print_usage (argc, argv, params);
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99
117
fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
100
118
fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
101
119
fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
102
- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
103
- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
104
120
fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
105
121
fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
106
122
fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112
128
fprintf (stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n " , params.save_audio ? " true" : " false" );
113
129
fprintf (stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n " , params.use_gpu ? " false" : " true" );
114
130
fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n " , params.flash_attn ? " true" : " false" );
131
+ fprintf (stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n " , params.no_prints ? " true" : " false" );
132
+ // Voice Activity Detection (VAD) parameters
133
+ fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
134
+ fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
135
+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n " , params.vad_model .c_str ());
136
+ fprintf (stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n " , params.vad_threshold );
137
+ fprintf (stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration in ms\n " , params.vad_min_speech_duration_ms ); // integer millisecond value, so no 0.0-1.0 range applies
138
+ fprintf (stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n " , params.vad_min_silence_duration_ms );
139
+ fprintf (stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n " , params.vad_max_speech_duration_s == FLT_MAX ?
140
+ std::string (" FLT_MAX" ).c_str () :
141
+ std::to_string (params.vad_max_speech_duration_s ).c_str ());
142
+ fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
143
+ fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
115
144
fprintf (stderr, " \n " );
116
145
}
117
146
@@ -124,6 +153,17 @@ int main(int argc, char ** argv) {
124
153
return 1 ;
125
154
}
126
155
156
+ if (params.no_prints ) {
157
+ whisper_log_set ([](enum ggml_log_level, const char *, void *) { }, NULL );
158
+ }
159
+
160
+ if (params.vad ) {
161
+ // For VAD, ensure at least 500 ms of context. This floor must not be clamped to step_ms afterwards: with step_ms below 500 (e.g. 0) the clamp would undo it.
162
+ params.keep_ms = std::max (params.keep_ms , 500 );
163
+ } else { // non-VAD mode: keep no more audio than one step
164
+ params.keep_ms = std::min (params.keep_ms , params.step_ms );
165
+ }
166
+
127
- params.keep_ms = std::min (params.keep_ms , params.step_ms );
128
168
params.length_ms = std::max (params.length_ms , params.step_ms );
129
169
@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
132
172
const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
133
173
const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
134
174
135
- const bool use_vad = n_samples_step <= 0 ; // sliding window mode uses VAD
175
+ const bool use_vad = params. vad ;
136
176
137
177
const int n_new_line = !use_vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
138
178
@@ -242,6 +282,30 @@ int main(int argc, char ** argv) {
242
282
break ;
243
283
}
244
284
285
+ whisper_full_params wparams = whisper_full_default_params (params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
286
+
287
+ wparams.print_progress = false ;
288
+ wparams.print_special = params.print_special ;
289
+ wparams.print_realtime = false ;
290
+ wparams.print_timestamps = !params.no_timestamps ;
291
+ wparams.translate = params.translate ;
292
+ wparams.single_segment = !use_vad;
293
+ wparams.max_tokens = params.max_tokens ;
294
+ wparams.language = params.language .c_str ();
295
+ wparams.n_threads = params.n_threads ;
296
+ wparams.beam_search .beam_size = params.beam_size ;
297
+
298
+ wparams.audio_ctx = params.audio_ctx ;
299
+
300
+ wparams.tdrz_enable = params.tinydiarize ; // [TDRZ]
301
+
302
+ // disable temperature fallback
303
+ // wparams.temperature_inc = -1.0f;
304
+ wparams.temperature_inc = params.no_fallback ? 0 .0f : wparams.temperature_inc ;
305
+
306
+ wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
307
+ wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
308
+
245
309
// process new audio
246
310
247
311
if (!use_vad) {
@@ -295,8 +359,26 @@ int main(int argc, char ** argv) {
295
359
296
360
audio.get (2000 , pcmf32_new);
297
361
298
- if (::vad_simple (pcmf32_new, WHISPER_SAMPLE_RATE, 1000 , params.vad_thold , params.freq_thold , false )) {
299
- audio.get (params.length_ms , pcmf32);
362
+ whisper_full_params wvparams = whisper_full_default_params (WHISPER_SAMPLING_GREEDY);
363
+ wvparams.vad = params.vad ;
364
+ wvparams.vad_model_path = params.vad_model .c_str ();
365
+ wvparams.vad_params .threshold = params.vad_threshold ;
366
+ wvparams.vad_params .min_speech_duration_ms = params.vad_min_speech_duration_ms ;
367
+ wvparams.vad_params .min_silence_duration_ms = params.vad_min_silence_duration_ms ;
368
+ wvparams.vad_params .max_speech_duration_s = params.vad_max_speech_duration_s ;
369
+ wvparams.vad_params .speech_pad_ms = params.vad_speech_pad_ms ;
370
+ wvparams.vad_params .samples_overlap = params.vad_samples_overlap ;
371
+
372
+ float * vad_samples = nullptr ; // null until whisper_vad assigns it, so every free () below is well-defined
373
+ int n_vad_samples = 0 ;
374
+ if (whisper_vad (ctx, wvparams, pcmf32_new.data (), pcmf32_new.size (), &vad_samples, &n_vad_samples)) {
375
+ if (n_vad_samples == 0 ) {
+ free (vad_samples); // NOTE(review): presumably a buffer can be handed back even when 0 samples are reported - confirm; free (nullptr) is a no-op otherwise
376
+ std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
377
+ continue ;
378
+ }
379
+
380
+ pcmf32.assign (vad_samples, vad_samples + n_vad_samples);
381
+ free (vad_samples); // the samples were copied into pcmf32, so the returned buffer is released here
300
382
} else {
301
383
std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
302
384
@@ -308,30 +390,6 @@ int main(int argc, char ** argv) {
308
390
309
391
// run the inference
310
392
{
311
- whisper_full_params wparams = whisper_full_default_params (params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
312
-
313
- wparams.print_progress = false ;
314
- wparams.print_special = params.print_special ;
315
- wparams.print_realtime = false ;
316
- wparams.print_timestamps = !params.no_timestamps ;
317
- wparams.translate = params.translate ;
318
- wparams.single_segment = !use_vad;
319
- wparams.max_tokens = params.max_tokens ;
320
- wparams.language = params.language .c_str ();
321
- wparams.n_threads = params.n_threads ;
322
- wparams.beam_search .beam_size = params.beam_size ;
323
-
324
- wparams.audio_ctx = params.audio_ctx ;
325
-
326
- wparams.tdrz_enable = params.tinydiarize ; // [TDRZ]
327
-
328
- // disable temperature fallback
329
- // wparams.temperature_inc = -1.0f;
330
- wparams.temperature_inc = params.no_fallback ? 0 .0f : wparams.temperature_inc ;
331
-
332
- wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
333
- wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
334
-
335
393
if (whisper_full (ctx, wparams, pcmf32.data (), pcmf32.size ()) != 0 ) {
336
394
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
337
395
return 6 ;
0 commit comments