@@ -25,9 +25,6 @@ struct whisper_params {
25
25
int32_t audio_ctx = 0 ;
26
26
int32_t beam_size = -1 ;
27
27
28
- float vad_thold = 0 .6f ;
29
- float freq_thold = 100 .0f ;
30
-
31
28
bool translate = false ;
32
29
bool no_fallback = false ;
33
30
bool print_special = false ;
@@ -37,10 +34,21 @@ struct whisper_params {
37
34
bool save_audio = false ; // save audio to wav file
38
35
bool use_gpu = true ;
39
36
bool flash_attn = false ;
37
+ bool no_prints = false ;
40
38
41
39
std::string language = " en" ;
42
40
std::string model = " models/ggml-base.en.bin" ;
43
41
std::string fname_out;
42
+
43
+ // Voice Activity Detection (VAD) parameters
44
+ bool vad = false ;
45
+ std::string vad_model = " models/for-tests-silero-v5.1.2-ggml.bin" ;
46
+ float vad_threshold = 0 .5f ;
47
+ int vad_min_speech_duration_ms = 250 ;
48
+ int vad_min_silence_duration_ms = 100 ;
49
+ float vad_max_speech_duration_s = FLT_MAX;
50
+ int vad_speech_pad_ms = 30 ;
51
+ float vad_samples_overlap = 0 .1f ;
44
52
};
45
53
46
54
void whisper_print_usage (int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
61
69
else if (arg == " -mt" || arg == " --max-tokens" ) { params.max_tokens = std::stoi (argv[++i]); }
62
70
else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio_ctx = std::stoi (argv[++i]); }
63
71
else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
64
- else if (arg == " -vth" || arg == " --vad-thold" ) { params.vad_thold = std::stof (argv[++i]); }
65
- else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
66
72
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
67
73
else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
68
74
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
74
80
else if (arg == " -sa" || arg == " --save-audio" ) { params.save_audio = true ; }
75
81
else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
76
82
else if (arg == " -fa" || arg == " --flash-attn" ) { params.flash_attn = true ; }
77
-
83
+ else if (arg == " -np" || arg == " --no-prints" ) { params.no_prints = true ; }
84
+ // Voice Activity Detection (VAD)
85
+ else if ( arg == " --vad" ) { params.vad = true ; }
86
+ else if (arg == " -vm" || arg == " --vad-model" ) { params.vad_model = argv[++i]; }
87
+ else if (arg == " -vt" || arg == " --vad-threshold" ) { params.vad_threshold = std::stof (argv[++i]); }
88
+ // NOTE: short flags must match the usage text (-vspd = min speech, -vsd = min silence) and each flag must write its own field.
+ else if (arg == " -vspd" || arg == " --vad-min-speech-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); }
89
+ else if (arg == " -vsd" || arg == " --vad-min-silence-duration-ms" ) { params.vad_min_silence_duration_ms = std::stoi (argv[++i]); }
90
+ else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
91
+ else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
92
+ else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
78
93
else {
79
94
fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
80
95
whisper_print_usage (argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99
114
fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
100
115
fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
101
116
fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
102
- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
103
- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
104
117
fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
105
118
fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
106
119
fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
@@ -112,6 +125,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112
125
fprintf (stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n " , params.save_audio ? " true" : " false" );
113
126
fprintf (stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n " , params.use_gpu ? " false" : " true" );
114
127
fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n " , params.flash_attn ? " true" : " false" );
128
+ fprintf (stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n " , params.no_prints ? " true" : " false" );
129
+ // Voice Activity Detection (VAD) parameters
130
+ fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
131
+ fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
132
+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n " , params.vad_model .c_str ());
133
+ fprintf (stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n " , params.vad_threshold );
134
+ fprintf (stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (ms)\n " , params.vad_min_speech_duration_ms );
135
+ fprintf (stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n " , params.vad_min_silence_duration_ms );
136
+ fprintf (stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n " , params.vad_max_speech_duration_s == FLT_MAX ?
137
+ std::string (" FLT_MAX" ).c_str () :
138
+ std::to_string (params.vad_max_speech_duration_s ).c_str ());
139
+ fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
140
+ fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
115
141
fprintf (stderr, " \n " );
116
142
}
117
143
@@ -122,20 +148,22 @@ int main(int argc, char ** argv) {
122
148
return 1 ;
123
149
}
124
150
151
+ if (params.no_prints ) {
152
+ whisper_log_set ([](enum ggml_log_level, const char *, void *) { }, NULL );
153
+ }
154
+
125
155
params.keep_ms = std::min (params.keep_ms , params.step_ms );
126
156
params.length_ms = std::max (params.length_ms , params.step_ms );
127
157
158
+
128
159
const int n_samples_step = (1e-3 *params.step_ms )*WHISPER_SAMPLE_RATE;
129
160
const int n_samples_len = (1e-3 *params.length_ms )*WHISPER_SAMPLE_RATE;
130
161
const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
131
162
const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
132
163
133
- const bool use_vad = n_samples_step <= 0 ; // sliding window mode uses VAD
134
-
135
- const int n_new_line = !use_vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
136
-
137
- params.no_timestamps = !use_vad;
138
- params.no_context |= use_vad;
164
+ // number of steps to print new line
+ // guard step_ms <= 0: VAD mode is now selected by --vad rather than by a non-positive step, so without this check a zero step would divide by zero
+ const int n_new_line = (!params.vad && params.step_ms > 0 ) ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ;
165
+ params.no_timestamps = !params.vad ;
166
+ params.no_context |= params.vad ;
139
167
params.max_tokens = 0 ;
140
168
141
169
// init audio
@@ -189,7 +217,7 @@ int main(int argc, char ** argv) {
189
217
params.translate ? " translate" : " transcribe" ,
190
218
params.no_timestamps ? 0 : 1 );
191
219
192
- if (!use_vad ) {
220
+ if (!params. vad ) {
193
221
fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
194
222
} else {
195
223
fprintf (stderr, " %s: using VAD, will transcribe on speech activity\n " , __func__);
@@ -242,7 +270,7 @@ int main(int argc, char ** argv) {
242
270
243
271
// process new audio
244
272
245
- if (!use_vad ) {
273
+ if (!params. vad ) {
246
274
while (true ) {
247
275
// handle Ctrl + C
248
276
is_running = sdl_poll_events ();
@@ -270,7 +298,7 @@ int main(int argc, char ** argv) {
270
298
// take up to params.length_ms audio from previous iteration
271
299
const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
272
300
273
- // printf( "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
301
+ // fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
274
302
275
303
pcmf32.resize (n_samples_new + n_samples_take);
276
304
@@ -285,22 +313,29 @@ int main(int argc, char ** argv) {
285
313
const auto t_now = std::chrono::high_resolution_clock::now ();
286
314
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count ();
287
315
288
- if (t_diff < 2000 ) {
289
- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
290
-
316
+ if (t_diff < params.step_ms ) {
317
+ // sleep only for the remainder of the step; sleeping a full step here would let more than step_ms of audio accumulate while only step_ms is fetched below
+ std::this_thread::sleep_for (std::chrono::milliseconds (params.step_ms - t_diff));
291
318
continue ;
292
319
}
293
320
294
- audio.get (2000 , pcmf32_new);
321
+ // Get new audio for this step
322
+ audio.get (params.step_ms , pcmf32_new);
295
323
296
- if (::vad_simple (pcmf32_new, WHISPER_SAMPLE_RATE, 1000 , params.vad_thold , params.freq_thold , false )) {
297
- audio.get (params.length_ms , pcmf32);
298
- } else {
299
- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
324
+ // Calculate how much old audio to keep
325
+ const int n_samples_new = pcmf32_new.size ();
326
+ const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
300
327
301
- continue ;
328
+ // Combine old + new audio with overlap
329
+ pcmf32.resize (n_samples_new + n_samples_take);
330
+
331
+ // Copy kept portion from previous iteration
332
+ for (int i = 0 ; i < n_samples_take; i++) {
333
+ pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
302
334
}
303
335
336
+ // Append new audio
337
+ memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new * sizeof (float ));
338
+
304
339
t_last = t_now;
305
340
}
306
341
@@ -313,7 +348,6 @@ int main(int argc, char ** argv) {
313
348
wparams.print_realtime = false ;
314
349
wparams.print_timestamps = !params.no_timestamps ;
315
350
wparams.translate = params.translate ;
316
- wparams.single_segment = !use_vad;
317
351
wparams.max_tokens = params.max_tokens ;
318
352
wparams.language = params.language .c_str ();
319
353
wparams.n_threads = params.n_threads ;
@@ -330,27 +364,30 @@ int main(int argc, char ** argv) {
330
364
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
331
365
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
332
366
367
+ wparams.vad = params.vad ;
368
+ wparams.vad_model_path = params.vad_model .c_str ();
369
+
370
+ wparams.vad_params .threshold = params.vad_threshold ;
371
+ wparams.vad_params .min_speech_duration_ms = params.vad_min_speech_duration_ms ;
372
+ wparams.vad_params .min_silence_duration_ms = params.vad_min_silence_duration_ms ;
373
+ wparams.vad_params .max_speech_duration_s = params.vad_max_speech_duration_s ;
374
+ wparams.vad_params .speech_pad_ms = params.vad_speech_pad_ms ;
375
+ wparams.vad_params .samples_overlap = params.vad_samples_overlap ;
376
+
333
377
if (whisper_full (ctx, wparams, pcmf32.data (), pcmf32.size ()) != 0 ) {
334
378
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
335
379
return 6 ;
336
380
}
337
381
338
382
// print result;
339
383
{
340
- if (!use_vad ) {
384
+ if (!params. vad ) {
341
385
printf (" \33 [2K\r " );
342
386
343
387
// print long empty line to clear the previous line
344
388
printf (" %s" , std::string (100 , ' ' ).c_str ());
345
389
346
390
printf (" \33 [2K\r " );
347
- } else {
348
- const int64_t t1 = (t_last - t_start).count ()/1000000 ;
349
- const int64_t t0 = std::max (0.0 , t1 - pcmf32.size ()*1000.0 /WHISPER_SAMPLE_RATE);
350
-
351
- printf (" \n " );
352
- printf (" ### Transcription %d START | t0 = %d ms | t1 = %d ms\n " , n_iter, (int ) t0, (int ) t1);
353
- printf (" \n " );
354
391
}
355
392
356
393
const int n_segments = whisper_full_n_segments (ctx);
@@ -389,15 +426,11 @@ int main(int argc, char ** argv) {
389
426
fout << std::endl;
390
427
}
391
428
392
- if (use_vad) {
393
- printf (" \n " );
394
- printf (" ### Transcription %d END\n " , n_iter);
395
- }
396
429
}
397
430
398
431
++n_iter;
399
432
400
- if (!use_vad && (n_iter % n_new_line) == 0 ) {
433
+ if (!params. vad && (n_iter % n_new_line) == 0 ) {
401
434
printf (" \n " );
402
435
403
436
// keep part of the audio for next iteration to try to mitigate word boundary issues
0 commit comments