Skip to content

Commit ad59b6c

Browse files
committed
examples : update stream to use VAD (new)
This commit updates the stream example to use the new Voice Activity Detection (VAD) support instead of the simple VAD implementation it previously used.
1 parent d26dc56 commit ad59b6c

File tree

2 files changed

+131
-83
lines changed

2 files changed

+131
-83
lines changed

examples/stream/README.md

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,41 @@
1-
# whisper.cpp/examples/stream
2-
3-
This is a naive example of performing real-time inference on audio from your microphone.
4-
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5-
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6-
7-
```bash
8-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9-
```
10-
11-
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12-
13-
## Sliding window mode with VAD
14-
15-
Setting the `--step` argument to `0` enables the sliding window mode:
16-
17-
```bash
18-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
19-
```
20-
21-
In this mode, the tool will transcribe only after some speech activity is detected. A very
22-
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
23-
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
24-
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
25-
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
26-
a transcription block that is suitable for parsing.
27-
28-
## Building
29-
30-
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
31-
32-
```bash
33-
# Install SDL2
34-
# On Debian based linux distributions:
35-
sudo apt-get install libsdl2-dev
36-
37-
# On Fedora Linux:
38-
sudo dnf install SDL2 SDL2-devel
39-
40-
# Install SDL2 on Mac OS
41-
brew install sdl2
42-
43-
cmake -B build -DWHISPER_SDL2=ON
44-
cmake --build build --config Release
45-
46-
./build/bin/whisper-stream
47-
```
48-
49-
## Web version
50-
51-
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
1+
# whisper.cpp/examples/stream
2+
3+
This is a naive example of performing real-time inference on audio from your microphone.
4+
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5+
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6+
7+
```bash
8+
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9+
```
10+
11+
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12+
13+
## VAD support
14+
15+
VAD support can be enabled by specifying the `--vad` flag and optionally a `--vad-model` (by default
16+
`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
17+
18+
## Building
19+
20+
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
21+
22+
```bash
23+
# Install SDL2
24+
# On Debian based linux distributions:
25+
sudo apt-get install libsdl2-dev
26+
27+
# On Fedora Linux:
28+
sudo dnf install SDL2 SDL2-devel
29+
30+
# Install SDL2 on Mac OS
31+
brew install sdl2
32+
33+
cmake -B build -DWHISPER_SDL2=ON
34+
cmake --build build --config Release
35+
36+
./build/bin/whisper-stream
37+
```
38+
39+
## Web version
40+
41+
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

examples/stream/stream.cpp

Lines changed: 90 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,21 @@ struct whisper_params {
3737
bool save_audio = false; // save audio to wav file
3838
bool use_gpu = true;
3939
bool flash_attn = false;
40+
bool no_prints = false;
4041

4142
std::string language = "en";
4243
std::string model = "models/ggml-base.en.bin";
4344
std::string fname_out;
45+
46+
// Voice Activity Detection (VAD) parameters
47+
bool vad = false;
48+
std::string vad_model = "models/for-tests-silero-v5.1.2-ggml.bin";
49+
float vad_threshold = 0.5f;
50+
int vad_min_speech_duration_ms = 250;
51+
int vad_min_silence_duration_ms = 100;
52+
float vad_max_speech_duration_s = FLT_MAX;
53+
int vad_speech_pad_ms = 30;
54+
float vad_samples_overlap = 0.1f;
4455
};
4556

4657
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
6172
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
6273
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
6374
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
64-
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
65-
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
6675
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
6776
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
6877
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
7483
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
7584
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
7685
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
77-
86+
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
87+
// Voice Activity Detection (VAD)
88+
else if ( arg == "--vad") { params.vad = true; }
89+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
90+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
91+
else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
92+
else if ( arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
93+
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
94+
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
95+
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
7896
else {
7997
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
8098
whisper_print_usage(argc, argv, params);
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99117
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
100118
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101119
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
102-
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
103-
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
104120
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
105121
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
106122
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112128
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
113129
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
114130
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false");
131+
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
132+
// Voice Activity Detection (VAD) parameters
133+
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
134+
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
135+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
136+
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
137+
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
138+
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
139+
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
140+
std::string("FLT_MAX").c_str() :
141+
std::to_string(params.vad_max_speech_duration_s).c_str());
142+
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
143+
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
115144
fprintf(stderr, "\n");
116145
}
117146

@@ -124,6 +153,17 @@ int main(int argc, char ** argv) {
124153
return 1;
125154
}
126155

156+
if (params.no_prints) {
157+
whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL);
158+
}
159+
160+
if (params.vad) {
161+
// For VAD, ensure at least 500 ms of context
162+
params.keep_ms = std::max(params.keep_ms, 500);
163+
} else {
164+
params.keep_ms = std::min(params.keep_ms, params.step_ms);
165+
}
166+
127167
params.keep_ms = std::min(params.keep_ms, params.step_ms);
128168
params.length_ms = std::max(params.length_ms, params.step_ms);
129169

@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
132172
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
133173
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
134174

135-
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
175+
const bool use_vad = params.vad;
136176

137177
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
138178

@@ -242,6 +282,30 @@ int main(int argc, char ** argv) {
242282
break;
243283
}
244284

285+
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
286+
287+
wparams.print_progress = false;
288+
wparams.print_special = params.print_special;
289+
wparams.print_realtime = false;
290+
wparams.print_timestamps = !params.no_timestamps;
291+
wparams.translate = params.translate;
292+
wparams.single_segment = !use_vad;
293+
wparams.max_tokens = params.max_tokens;
294+
wparams.language = params.language.c_str();
295+
wparams.n_threads = params.n_threads;
296+
wparams.beam_search.beam_size = params.beam_size;
297+
298+
wparams.audio_ctx = params.audio_ctx;
299+
300+
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
301+
302+
// disable temperature fallback
303+
//wparams.temperature_inc = -1.0f;
304+
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
305+
306+
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
307+
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
308+
245309
// process new audio
246310

247311
if (!use_vad) {
@@ -295,8 +359,26 @@ int main(int argc, char ** argv) {
295359

296360
audio.get(2000, pcmf32_new);
297361

298-
if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
299-
audio.get(params.length_ms, pcmf32);
362+
whisper_full_params wvparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
363+
wvparams.vad = params.vad;
364+
wvparams.vad_model_path = params.vad_model.c_str();
365+
wvparams.vad_params.threshold = params.vad_threshold;
366+
wvparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
367+
wvparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
368+
wvparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
369+
wvparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
370+
wvparams.vad_params.samples_overlap = params.vad_samples_overlap;
371+
372+
float * vad_samples;
373+
int n_vad_samples;
374+
if (whisper_vad(ctx, wvparams, pcmf32_new.data(), pcmf32_new.size(), &vad_samples, &n_vad_samples)) {
375+
if (n_vad_samples == 0) {
376+
std::this_thread::sleep_for(std::chrono::milliseconds(100));
377+
continue;
378+
}
379+
380+
pcmf32.assign(vad_samples, vad_samples + n_vad_samples);
381+
free(vad_samples);
300382
} else {
301383
std::this_thread::sleep_for(std::chrono::milliseconds(100));
302384

@@ -308,30 +390,6 @@ int main(int argc, char ** argv) {
308390

309391
// run the inference
310392
{
311-
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
312-
313-
wparams.print_progress = false;
314-
wparams.print_special = params.print_special;
315-
wparams.print_realtime = false;
316-
wparams.print_timestamps = !params.no_timestamps;
317-
wparams.translate = params.translate;
318-
wparams.single_segment = !use_vad;
319-
wparams.max_tokens = params.max_tokens;
320-
wparams.language = params.language.c_str();
321-
wparams.n_threads = params.n_threads;
322-
wparams.beam_search.beam_size = params.beam_size;
323-
324-
wparams.audio_ctx = params.audio_ctx;
325-
326-
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
327-
328-
// disable temperature fallback
329-
//wparams.temperature_inc = -1.0f;
330-
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
331-
332-
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
333-
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
334-
335393
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
336394
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
337395
return 6;

0 commit comments

Comments
 (0)