Skip to content

Commit b924706

Browse files
committed
examples : update vad support in stream example [no ci]
wip
1 parent d1f114d commit b924706

File tree

2 files changed

+114
-91
lines changed

2 files changed

+114
-91
lines changed

examples/stream/README.md

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,41 @@
1-
# whisper.cpp/examples/stream
2-
3-
This is a naive example of performing real-time inference on audio from your microphone.
4-
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5-
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6-
7-
```bash
8-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9-
```
10-
11-
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12-
13-
## Sliding window mode with VAD
14-
15-
Setting the `--step` argument to `0` enables the sliding window mode:
16-
17-
```bash
18-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
19-
```
20-
21-
In this mode, the tool will transcribe only after some speech activity is detected. A very
22-
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
23-
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
24-
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
25-
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
26-
a transcription block that is suitable for parsing.
27-
28-
## Building
29-
30-
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
31-
32-
```bash
33-
# Install SDL2
34-
# On Debian based linux distributions:
35-
sudo apt-get install libsdl2-dev
36-
37-
# On Fedora Linux:
38-
sudo dnf install SDL2 SDL2-devel
39-
40-
# Install SDL2 on Mac OS
41-
brew install sdl2
42-
43-
cmake -B build -DWHISPER_SDL2=ON
44-
cmake --build build --config Release
45-
46-
./build/bin/whisper-stream
47-
```
48-
49-
## Web version
50-
51-
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
1+
# whisper.cpp/examples/stream
2+
3+
This is a naive example of performing real-time inference on audio from your microphone.
4+
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5+
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6+
7+
```bash
8+
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9+
```
10+
11+
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12+
13+
## VAD support
14+
15+
VAD support can be enabled by specifying the `--vad` argument and optionally a `--vad-model` (by default
16+
`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
17+
18+
## Building
19+
20+
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
21+
22+
```bash
23+
# Install SDL2
24+
# On Debian based linux distributions:
25+
sudo apt-get install libsdl2-dev
26+
27+
# On Fedora Linux:
28+
sudo dnf install SDL2 SDL2-devel
29+
30+
# Install SDL2 on Mac OS
31+
brew install sdl2
32+
33+
cmake -B build -DWHISPER_SDL2=ON
34+
cmake --build build --config Release
35+
36+
./build/bin/whisper-stream
37+
```
38+
39+
## Web version
40+
41+
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

examples/stream/stream.cpp

Lines changed: 73 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,6 @@ struct whisper_params {
2525
int32_t audio_ctx = 0;
2626
int32_t beam_size = -1;
2727

28-
float vad_thold = 0.6f;
29-
float freq_thold = 100.0f;
30-
3128
bool translate = false;
3229
bool no_fallback = false;
3330
bool print_special = false;
@@ -37,10 +34,21 @@ struct whisper_params {
3734
bool save_audio = false; // save audio to wav file
3835
bool use_gpu = true;
3936
bool flash_attn = false;
37+
bool no_prints = false;
4038

4139
std::string language = "en";
4240
std::string model = "models/ggml-base.en.bin";
4341
std::string fname_out;
42+
43+
// Voice Activity Detection (VAD) parameters
44+
bool vad = false;
45+
std::string vad_model = "models/for-tests-silero-v5.1.2-ggml.bin";
46+
float vad_threshold = 0.5f;
47+
int vad_min_speech_duration_ms = 250;
48+
int vad_min_silence_duration_ms = 100;
49+
float vad_max_speech_duration_s = FLT_MAX;
50+
int vad_speech_pad_ms = 30;
51+
float vad_samples_overlap = 0.1f;
4452
};
4553

4654
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
6169
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
6270
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
6371
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
64-
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
65-
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
6672
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
6773
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
6874
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
7480
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
7581
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
7682
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
77-
83+
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
84+
// Voice Activity Detection (VAD)
85+
else if ( arg == "--vad") { params.vad = true; }
86+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
87+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
88+
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
89+
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
90+
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
91+
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
92+
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
7893
else {
7994
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
8095
whisper_print_usage(argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99114
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
100115
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101116
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
102-
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
103-
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
104117
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
105118
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
106119
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -112,6 +125,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112125
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
113126
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
114127
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false");
128+
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
129+
// Voice Activity Detection (VAD) parameters
130+
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
131+
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
132+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
133+
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
134+
fprintf(stderr, "  -vspd N,  --vad-min-speech-duration-ms  N  [%-7d] VAD min speech duration (in ms)\n",               params.vad_min_speech_duration_ms);
135+
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
136+
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
137+
std::string("FLT_MAX").c_str() :
138+
std::to_string(params.vad_max_speech_duration_s).c_str());
139+
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
140+
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
115141
fprintf(stderr, "\n");
116142
}
117143

@@ -122,20 +148,22 @@ int main(int argc, char ** argv) {
122148
return 1;
123149
}
124150

151+
if (params.no_prints) {
152+
whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL);
153+
}
154+
125155
params.keep_ms = std::min(params.keep_ms, params.step_ms);
126156
params.length_ms = std::max(params.length_ms, params.step_ms);
127157

158+
128159
const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
129160
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
130161
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
131162
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
132163

133-
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
134-
135-
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
136-
137-
params.no_timestamps = !use_vad;
138-
params.no_context |= use_vad;
164+
const int n_new_line = !params.vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
165+
params.no_timestamps = !params.vad;
166+
params.no_context |= params.vad;
139167
params.max_tokens = 0;
140168

141169
// init audio
@@ -189,7 +217,7 @@ int main(int argc, char ** argv) {
189217
params.translate ? "translate" : "transcribe",
190218
params.no_timestamps ? 0 : 1);
191219

192-
if (!use_vad) {
220+
if (!params.vad) {
193221
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
194222
} else {
195223
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
@@ -242,7 +270,7 @@ int main(int argc, char ** argv) {
242270

243271
// process new audio
244272

245-
if (!use_vad) {
273+
if (!params.vad) {
246274
while (true) {
247275
// handle Ctrl + C
248276
is_running = sdl_poll_events();
@@ -270,7 +298,7 @@ int main(int argc, char ** argv) {
270298
// take up to params.length_ms audio from previous iteration
271299
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
272300

273-
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
301+
//fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
274302

275303
pcmf32.resize(n_samples_new + n_samples_take);
276304

@@ -285,22 +313,29 @@ int main(int argc, char ** argv) {
285313
const auto t_now = std::chrono::high_resolution_clock::now();
286314
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
287315

288-
if (t_diff < 2000) {
289-
std::this_thread::sleep_for(std::chrono::milliseconds(100));
290-
316+
if (t_diff < params.step_ms) {
317+
std::this_thread::sleep_for(std::chrono::milliseconds(params.step_ms));
291318
continue;
292319
}
293320

294-
audio.get(2000, pcmf32_new);
321+
// Get new audio for this step
322+
audio.get(params.step_ms, pcmf32_new);
295323

296-
if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
297-
audio.get(params.length_ms, pcmf32);
298-
} else {
299-
std::this_thread::sleep_for(std::chrono::milliseconds(100));
324+
// Calculate how much old audio to keep
325+
const int n_samples_new = pcmf32_new.size();
326+
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
300327

301-
continue;
328+
// Combine old + new audio with overlap
329+
pcmf32.resize(n_samples_new + n_samples_take);
330+
331+
// Copy kept portion from previous iteration
332+
for (int i = 0; i < n_samples_take; i++) {
333+
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
302334
}
303335

336+
// Append new audio
337+
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new * sizeof(float));
338+
304339
t_last = t_now;
305340
}
306341

@@ -313,7 +348,6 @@ int main(int argc, char ** argv) {
313348
wparams.print_realtime = false;
314349
wparams.print_timestamps = !params.no_timestamps;
315350
wparams.translate = params.translate;
316-
wparams.single_segment = !use_vad;
317351
wparams.max_tokens = params.max_tokens;
318352
wparams.language = params.language.c_str();
319353
wparams.n_threads = params.n_threads;
@@ -330,27 +364,30 @@ int main(int argc, char ** argv) {
330364
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
331365
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
332366

367+
wparams.vad = params.vad;
368+
wparams.vad_model_path = params.vad_model.c_str();
369+
370+
wparams.vad_params.threshold = params.vad_threshold;
371+
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
372+
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
373+
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
374+
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
375+
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
376+
333377
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
334378
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
335379
return 6;
336380
}
337381

338382
// print result;
339383
{
340-
if (!use_vad) {
384+
if (!params.vad) {
341385
printf("\33[2K\r");
342386

343387
// print long empty line to clear the previous line
344388
printf("%s", std::string(100, ' ').c_str());
345389

346390
printf("\33[2K\r");
347-
} else {
348-
const int64_t t1 = (t_last - t_start).count()/1000000;
349-
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
350-
351-
printf("\n");
352-
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
353-
printf("\n");
354391
}
355392

356393
const int n_segments = whisper_full_n_segments(ctx);
@@ -389,15 +426,11 @@ int main(int argc, char ** argv) {
389426
fout << std::endl;
390427
}
391428

392-
if (use_vad) {
393-
printf("\n");
394-
printf("### Transcription %d END\n", n_iter);
395-
}
396429
}
397430

398431
++n_iter;
399432

400-
if (!use_vad && (n_iter % n_new_line) == 0) {
433+
if (!params.vad && (n_iter % n_new_line) == 0) {
401434
printf("\n");
402435

403436
// keep part of the audio for next iteration to try to mitigate word boundary issues

0 commit comments

Comments
 (0)