-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecognize.cpp
More file actions
1336 lines (1149 loc) · 55.5 KB
/
recognize.cpp
File metadata and controls
1336 lines (1149 loc) · 55.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Real-time speech recognition with CoreML support for macOS
// Based on whisper.cpp/examples/stream/stream.cpp with CoreML optimizations
#include "common-sdl.h"
#include "common.h"
#include "common-whisper.h"
#include "whisper.h"
#include "model_manager.h"
#include "config_manager.h"
#include "export_manager.h"
#include "ptt_manager.h"
#include "history_manager.h"
#include "text_processing.h"
#include "meeting_manager.h"
#include "audio_processor.h"
#include "cli_parser.h"
#include "whisper_params.h"
#include <chrono>
#include <cstdio>
#include <fstream>
#include <string>
#include <thread>
#include <vector>
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <csignal>
#include <atomic>
#include <termios.h>
#include <unistd.h>
#include <fcntl.h>
#include <filesystem>
#include <iomanip>
#ifdef __APPLE__
#include <pthread.h>
#endif
// Global state for signal handling.
// Both flags are std::atomic so they are safe to set from the SIGINT handler
// and poll from the main loop without additional synchronization.
std::atomic<bool> g_interrupt_received(false);  // set by signal_handler on SIGINT; polled by the main loop
std::atomic<bool> g_is_recording(false);        // true while a recording session is active (gates the quit confirmation)
// Signal handler for graceful shutdown — only sets an atomic flag
// (the only async-signal-safe action taken here).
void signal_handler(int signal) {
    if (signal != SIGINT) {
        return;
    }
    g_interrupt_received.store(true);
}
// Suppress stderr output during initialization (SDL device listing, whisper
// model info, etc.) by redirecting fd 2 to /dev/null.
// Returns the saved duplicate of the original stderr fd for later restoration
// via restore_stderr(), or -1 on failure.
//
// Fix: if dup() fails we must NOT redirect — doing so would discard stderr
// with no way to ever restore it for the rest of the process.
static int suppress_stderr() {
    fflush(stderr);
    int saved_fd = dup(STDERR_FILENO);
    if (saved_fd < 0) {
        // Could not save the original stream; leave stderr untouched.
        return -1;
    }
    int devnull = open("/dev/null", O_WRONLY);
    if (devnull >= 0) {
        dup2(devnull, STDERR_FILENO);
        close(devnull);
    }
    return saved_fd;
}
// Re-attach fd 2 to the stream saved by suppress_stderr() and release the
// saved descriptor. A negative fd means suppression never happened — no-op.
static void restore_stderr(int saved_fd) {
    if (saved_fd < 0) {
        return;
    }
    fflush(stderr);
    dup2(saved_fd, STDERR_FILENO);
    close(saved_fd);
}
// Silent log callback for whisper/ggml during initialization.
// Matches the ggml log-callback signature and discards every message so
// model loading does not print to the terminal. Installed below via
// whisper_log_set()/ggml_log_set().
static void silent_log_callback(enum ggml_log_level /*level*/, const char * /*text*/, void * /*user_data*/) {
    // Intentionally empty — suppress all init chatter
}
// Check for interrupt in main loop and handle confirmation dialog safely.
// Returns true when the process should exit. Runs on the main thread (not in
// the signal handler), so blocking terminal I/O here is safe.
static bool check_interrupt_with_confirmation() {
    // Fast path: no SIGINT has been received since the last check.
    if (!g_interrupt_received.load()) return false;
    if (g_is_recording.load()) {
        // If no TTY attached (background process), exit immediately —
        // there is nobody to answer a confirmation prompt.
        if (!isatty(STDIN_FILENO)) {
            return true;
        }
        std::cout << "\n\n Recording in progress! Are you sure you want to quit? (y/N): " << std::flush;
        // Temporarily disable canonical mode and echo so a single keypress
        // answers the prompt without needing Enter.
        struct termios old_termios, new_termios;
        tcgetattr(STDIN_FILENO, &old_termios);
        new_termios = old_termios;
        new_termios.c_lflag &= ~(ICANON | ECHO);
        tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
        char c = getchar();
        // Restore the original terminal settings before doing anything else.
        tcsetattr(STDIN_FILENO, TCSANOW, &old_termios);
        if (c == 'y' || c == 'Y') {
            std::cout << "\n Stopping recording and exiting...\n" << std::endl;
            g_is_recording.store(false); // prevent re-prompting on subsequent checks
            return true;
        } else {
            std::cout << "\n Continuing recording...\n" << std::endl;
            // Any other key cancels the quit: clear the flag and keep running.
            g_interrupt_received.store(false);
            return false;
        }
    }
    // Not recording — exit immediately
    return true;
}
// Copy the session's accumulated transcription to the macOS clipboard,
// subject to the guards configured in params (eligibility via
// should_auto_copy(), a max session duration, and a max content size).
// On success, marks the session as copied so it is not copied again.
//
// Fix: std::chrono::hours::count() returns an implementation-defined rep
// (commonly long long) — printing it with "%ld" is undefined behavior on
// platforms where the rep is wider than long. Cast explicitly and use "%lld".
void perform_auto_copy(AutoCopySession& session, const whisper_params& params) {
    if (!should_auto_copy(session, params)) {
        return;
    }

    std::string content = session.transcription_buffer.str();
    content = trim_whitespace(content);
    if (content.empty()) {
        fprintf(stderr, "Auto-copy skipped: no content to copy.\n");
        return;
    }

    auto now = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::hours>(now - session.start_time);

    // Check duration limit
    if (duration.count() > params.auto_copy_max_duration_hours) {
        fprintf(stderr, "Auto-copy skipped: session duration (%lld hours) exceeded limit (%d hours).\n",
                static_cast<long long>(duration.count()), params.auto_copy_max_duration_hours);
        return;
    }

    // Check size limit
    if (content.size() > static_cast<size_t>(params.auto_copy_max_size_bytes)) {
        fprintf(stderr, "Auto-copy skipped: content size (%zu bytes) exceeded limit (%d bytes).\n",
                content.size(), params.auto_copy_max_size_bytes);
        return;
    }

    // Perform the copy
    if (copy_to_clipboard_macos(content)) {
        fprintf(stderr, "Transcription copied.\n");
        session.has_been_copied = true;
    } else {
        fprintf(stderr, "Auto-copy failed: unable to copy to clipboard.\n");
    }
}
// Serialize the session's transcription segments to disk through
// ExportManager. Does nothing when export is disabled or no segments exist.
// Prints an error listing the supported formats if params.export_format is
// not recognized.
void perform_export(ExportSession& session, const whisper_params& params) {
    if (!params.export_enabled || session.segments.empty()) {
        return;
    }

    // Reject unknown formats before doing any setup work.
    const auto supported_formats = ExportManager::get_supported_formats();
    bool recognized = false;
    for (const auto& fmt : supported_formats) {
        if (fmt == params.export_format) {
            recognized = true;
            break;
        }
    }
    if (!recognized) {
        fprintf(stderr, "Export failed: unsupported format '%s'. Supported formats: ", params.export_format.c_str());
        bool first = true;
        for (const auto& fmt : supported_formats) {
            if (!first) fprintf(stderr, ", ");
            fprintf(stderr, "%s", fmt.c_str());
            first = false;
        }
        fprintf(stderr, "\n");
        return;
    }

    // Configure the export manager from CLI/config parameters.
    ExportManager export_manager;
    export_manager.set_format(ExportManager::extension_to_format("." + params.export_format));
    if (!params.export_file.empty()) {
        export_manager.set_output_file(params.export_file);
    }
    export_manager.set_auto_filename(params.export_auto_filename);
    export_manager.set_include_metadata(params.export_include_metadata);
    export_manager.set_include_timestamps(params.export_include_timestamps);
    export_manager.set_include_confidence(params.export_include_confidence);

    // Hand over every captured segment for serialization.
    for (const auto& segment : session.segments) {
        export_manager.add_segment(segment);
    }

    // Stamp end-of-session metadata just before writing.
    session.metadata.end_time = std::chrono::system_clock::now();
    session.metadata.total_segments = session.segments.size();
    if (!session.segments.empty()) {
        const int64_t span_ms = session.segments.back().end_time_ms - session.segments.front().start_time_ms;
        session.metadata.total_duration_seconds = span_ms / 1000.0;
    }
    export_manager.set_metadata(session.metadata);

    if (export_manager.export_transcription()) {
        fprintf(stderr, "Export completed successfully.\n");
    } else {
        fprintf(stderr, "Export failed.\n");
    }
}
// Finalize session: auto-copy, export, and meeting processing.
// Runs every end-of-session side effect in a fixed order:
//   1. clipboard auto-copy, 2. file export, 3. meeting summarization via
//   Claude CLI (with a raw-transcript fallback), 4. history persistence.
// session_start is the wall-clock start used to compute the history duration.
static void finalize_session(const whisper_params& params,
                             AutoCopySession& auto_copy_session,
                             ExportSession& export_session,
                             SpeakerTracker& speaker_tracker,
                             MeetingSession& meeting_session,
                             const std::string& meeting_output_file,
                             const std::string& transcript_text,
                             std::chrono::high_resolution_clock::time_point session_start) {
    if (params.auto_copy_enabled) {
        perform_auto_copy(auto_copy_session, params);
    }
    if (params.export_enabled) {
        // The final speaker count is only known now — stamp it before export.
        export_session.metadata.total_speakers = speaker_tracker.total_speakers;
        perform_export(export_session, params);
    }
    if (params.meeting_mode) {
        meeting_session.total_speakers = std::max(meeting_session.total_speakers, speaker_tracker.total_speakers);
        double duration_minutes = meeting_session.get_duration_minutes();
        std::cerr << "\nProcessing meeting transcription with Claude CLI..." << std::endl;
        std::cerr << "Duration: " << static_cast<int>(duration_minutes) << " minutes, "
                  << "Speakers detected: " << meeting_session.total_speakers << std::endl;
        bool success = process_meeting_transcription(
            meeting_session.get_transcription(),
            params.meeting_prompt,
            meeting_output_file,
            params.meeting_timeout,
            duration_minutes,
            params.meeting_max_single_pass
        );
        if (!success) {
            // Claude CLI processing failed — fall back to saving the raw
            // transcription as markdown so the meeting content is not lost.
            std::ofstream raw_file(meeting_output_file);
            if (raw_file.is_open()) {
                auto now = std::chrono::system_clock::now();
                auto time_t = std::chrono::system_clock::to_time_t(now);
                std::tm tm_buf;
                localtime_r(&time_t, &tm_buf);  // thread-safe localtime variant
                raw_file << "# Meeting Transcription\n\n";
                raw_file << "**Date**: " << std::put_time(&tm_buf, "%Y-%m-%d %H:%M") << "\n";
                raw_file << "**Duration**: " << static_cast<int>(duration_minutes) << " minutes\n";
                raw_file << "**Speakers**: " << meeting_session.total_speakers << "\n";
                raw_file << "**Session ID**: " << meeting_session.session_id << "\n\n";
                raw_file << "---\n\n";
                raw_file << "## Raw Transcription\n\n";
                raw_file << meeting_session.get_transcription();
                raw_file.close();
                std::cerr << "Transcription saved to: " << meeting_output_file << std::endl;
            } else {
                std::cerr << "Failed to save transcription to file" << std::endl;
            }
        }
    }
    // Save to history
    if (params.history_enabled && !transcript_text.empty()) {
        HistoryManager history;
        if (history.open()) {
            auto now = std::chrono::high_resolution_clock::now();
            double duration_s = std::chrono::duration<double>(now - session_start).count();
            // Label the session by how it was driven (mirrors the CLI mode flags).
            std::string mode = params.ptt_mode ? "ptt" :
                               params.meeting_mode ? "meeting" :
                               (params.silence_timeout > 0) ? "auto-stop" : "continuous";
            history.save(transcript_text, duration_s, params.model, mode);
        }
    }
}
int main(int argc, char ** argv) {
// Handle "history" subcommand before any heavy initialization
if (argc >= 2 && std::string(argv[1]) == "history") {
return handle_history_command(argc - 2, argv + 2);
}
ggml_backend_load_all();
// Register signal handler for graceful shutdown
signal(SIGINT, signal_handler);
whisper_params params;
// Load configuration before parsing command line
ConfigManager config_manager;
config_manager.load_config();
config_manager.apply_to_params(params);
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.silence_timeout < 0.0f) {
fprintf(stderr, "error: --silence-timeout must be non-negative\n");
return 1;
}
// Validate PTT settings
if (params.ptt_mode) {
int ptt_key_code = PushToTalkManager::key_name_to_code(params.ptt_key);
if (ptt_key_code < 0) {
fprintf(stderr, "error: unknown PTT key '%s'. Valid keys: space, right_option, right_ctrl, fn, f13\n", params.ptt_key.c_str());
return 1;
}
if (params.meeting_mode) {
fprintf(stderr, "error: --ptt is incompatible with --meeting (meetings require continuous recording)\n");
return 1;
}
if (params.silence_timeout > 0.0f) {
fprintf(stderr, "note: --silence-timeout ignored in PTT mode (key release stops recording)\n");
params.silence_timeout = 0.0f;
}
if (params.ptt_pre_roll_ms < 0 || params.ptt_pre_roll_ms > 2000) {
fprintf(stderr, "error: --ptt-pre-roll must be 0-2000ms\n");
return 1;
}
}
// Validate refine: check claude CLI is available early
if (params.refine && !is_claude_cli_available()) {
fprintf(stderr, "error: --refine requires Claude CLI. Install from: https://claude.ai/code\n");
return 1;
}
// Initialize model manager
ModelManager model_manager;
// Apply configured models directory if set
ConfigData effective_config = config_manager.get_effective_config();
if (effective_config.models_directory) {
model_manager.set_models_directory(*effective_config.models_directory);
}
// Handle model management commands (exit early if matched)
int cmd_result = handle_model_commands(params, model_manager);
if (cmd_result >= 0) return cmd_result;
// Show clean loading state
const bool stderr_is_tty = isatty(STDERR_FILENO);
// Resolve model (with auto-download if needed)
std::string resolved_model = model_manager.resolve_model(params.model, params.use_coreml);
if (resolved_model.empty()) {
std::cerr << "\n❌ No model available. Exiting.\n";
return 1;
}
// Update params with resolved model path
params.model = resolved_model;
// Extract short model name for display (e.g. "ggml-large-v3-turbo.bin" → "large-v3-turbo")
std::string display_model = std::filesystem::path(resolved_model).stem().string();
if (display_model.rfind("ggml-", 0) == 0) {
display_model = display_model.substr(5);
}
if (stderr_is_tty) {
fprintf(stderr, "[Loading %s...]\n", display_model.c_str());
}
// Auto-set CoreML model path if CoreML is enabled and not explicitly set
if (params.use_coreml && params.coreml_model.empty()) {
// Extract model name from the resolved path
std::filesystem::path model_path(resolved_model);
std::string model_filename = model_path.filename().string();
// Try to find corresponding model name
for (const auto& name : model_manager.get_model_names()) {
if (model_manager.get_model_path(name) == resolved_model) {
std::string coreml_path = model_manager.get_coreml_model_path(name);
if (model_manager.coreml_model_exists(name)) {
params.coreml_model = coreml_path;
} else {
params.use_coreml = false; // Disable CoreML to prevent crashes
}
break;
}
}
}
// Adjust thread count based on hardware acceleration
// When CoreML + Metal are active, encoder runs on ANE and decoder on GPU,
// so very few CPU threads are needed (just orchestration overhead).
{
int default_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
if (params.use_coreml && params.use_gpu) {
// CoreML encoder on ANE + Metal decoder: minimal CPU threads
if (params.n_threads == default_threads) {
params.n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
}
} else if (!params.use_coreml && params.use_gpu) {
// Metal only: GPU handles most compute
if (params.n_threads == default_threads) {
params.n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
}
} else if (params.n_threads <= 4) {
// CPU only: use more threads
params.n_threads = std::min(8, (int32_t)std::thread::hardware_concurrency());
}
}
// Resolve VAD model path (supports "auto" for auto-download)
if (!params.vad_model_path.empty()) {
params.vad_model_path = model_manager.resolve_vad_model(params.vad_model_path);
}
// Meeting mode: apply optimized defaults (can still be overridden by explicit CLI args)
// These are set after CLI parsing so they only apply if the user didn't explicitly set them
if (params.meeting_mode) {
// Auto-enable speaker turn detection for meetings
if (!params.tinydiarize) params.tinydiarize = true;
// Better accuracy defaults for meeting transcription
if (params.keep_ms == 200) params.keep_ms = 1000; // Research: 1000ms optimal overlap
if (params.step_ms == 3000) params.step_ms = 5000; // Longer processing windows
if (params.length_ms == 10000) params.length_ms = 15000;
if (params.beam_size <= 0) params.beam_size = 5; // Beam search for meetings
if (params.freq_thold == 100.0f) params.freq_thold = 200.0f; // Better high-pass filter
if (params.initial_prompt.empty()) {
params.initial_prompt = "Meeting transcription with proper punctuation and capitalization.";
}
}
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;
params.no_timestamps = !use_vad;
// In meeting mode, keep context between chunks for better accuracy
if (!params.meeting_mode) {
params.no_context |= use_vad;
}
if (use_vad) params.max_tokens = 0;
// Init audio — suppress SDL device listing during init
// PTT mode needs large buffer for long recordings; standard mode uses length_ms
int audio_buffer_ms = params.length_ms;
if (params.ptt_mode) {
audio_buffer_ms = std::max(audio_buffer_ms, 600000 + params.ptt_pre_roll_ms);
}
audio_async audio(audio_buffer_ms);
int saved_stderr = suppress_stderr();
bool audio_ok = audio.init(params.capture_id, WHISPER_SAMPLE_RATE);
restore_stderr(saved_stderr);
if (!audio_ok) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// Request P-core scheduling for the inference thread
#ifdef __APPLE__
pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0);
#endif
// Set recording state for signal handler
g_is_recording.store(true);
// Whisper init with CoreML support
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
struct whisper_context_params cparams = whisper_context_default_params();
// Configure CoreML if available and requested
#ifdef WHISPER_COREML
if (params.use_coreml && params.coreml_no_ane) {
setenv("WHISPER_COREML_NO_ANE", "1", 1);
}
if (params.use_coreml) {
cparams.use_gpu = params.use_gpu; // Metal for decoder, CoreML for encoder
} else {
cparams.use_gpu = params.use_gpu;
}
#else
cparams.use_gpu = params.use_gpu;
if (params.use_coreml) {
fprintf(stderr, "warning: CoreML requested but not compiled with CoreML support\n");
}
#endif
cparams.flash_attn = params.flash_attn;
// Suppress whisper/ggml verbose logging during model load
whisper_log_set(silent_log_callback, nullptr);
ggml_log_set(silent_log_callback, nullptr);
auto t_init_start = std::chrono::high_resolution_clock::now();
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
// Restore logging before printing error
whisper_log_set(nullptr, nullptr);
ggml_log_set(nullptr, nullptr);
fprintf(stderr, "error: failed to initialize whisper context\n");
return 2;
}
if (stderr_is_tty) {
auto t_model_loaded = std::chrono::high_resolution_clock::now();
auto model_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_model_loaded - t_init_start).count();
if (model_ms > 1000) {
fprintf(stderr, "[Model loaded in %.1fs]\n", model_ms / 1000.0);
}
}
// CoreML warm-up: first inference triggers ANE compilation
// For large models this can take 30s+ on first run (ANE caches for subsequent runs)
// Skip when coreml_no_ane — CPU+GPU mode has no ANE compilation overhead
#ifdef WHISPER_COREML
if (params.use_coreml && !params.coreml_no_ane) {
if (stderr_is_tty) {
fprintf(stderr, "[Warming up CoreML (first run may take a while)...]\n");
fflush(stderr);
}
auto t_warmup_start = std::chrono::high_resolution_clock::now();
std::vector<float> warmup(WHISPER_SAMPLE_RATE * 1, 0.0f); // 1 second of silence
whisper_full_params wp = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wp.print_realtime = false;
wp.print_progress = false;
wp.print_timestamps = false;
wp.print_special = false;
wp.n_threads = params.n_threads;
wp.single_segment = true;
wp.max_tokens = 1;
whisper_full(ctx, wp, warmup.data(), warmup.size());
if (stderr_is_tty) {
auto t_warmup_end = std::chrono::high_resolution_clock::now();
auto warmup_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_warmup_end - t_warmup_start).count();
fprintf(stderr, "[CoreML ready in %.1fs]\n", warmup_ms / 1000.0);
}
}
#endif
// Keep whisper/ggml logging suppressed for clean UX
// (Metal kernel JIT compilation can log during first real inference)
// Validate output mode
if (params.output_mode != "original" && params.output_mode != "english" && params.output_mode != "bilingual") {
fprintf(stderr, "error: invalid output mode '%s'. Valid modes: original, english, bilingual\n", params.output_mode.c_str());
whisper_free(ctx);
return 1;
}
// Check compatibility between translate flag and output mode
if (params.translate) {
// If --translate is used, default to "english" mode for compatibility
if (params.output_mode == "original") {
params.output_mode = "english";
fprintf(stderr, "%s: --translate flag detected, switching to 'english' output mode\n", __func__);
}
}
// Check if translation is supported for non-original modes
bool needs_translation = (params.output_mode == "english" || params.output_mode == "bilingual");
if (needs_translation && !whisper_is_multilingual(ctx)) {
fprintf(stderr, "error: output mode '%s' requires a multilingual model, but current model is English-only\n", params.output_mode.c_str());
whisper_free(ctx);
return 1;
}
// For bilingual mode, we need a second context for translation
struct whisper_context * ctx_translate = nullptr;
if (params.output_mode == "bilingual") {
ctx_translate = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx_translate == nullptr) {
fprintf(stderr, "error: failed to initialize translation context for bilingual mode\n");
whisper_free(ctx);
return 2;
}
}
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
pcmf32_old.reserve(n_samples_30s);
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
std::vector<whisper_token> prompt_tokens;
// Auto-correct language settings for non-multilingual models
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
}
}
int n_iter = 0;
bool is_running = true;
const bool stdout_is_tty = isatty(STDOUT_FILENO);
// When stdout is not a TTY (redirected to file/pipe), accumulate text
// and dump it on exit instead of streaming with ANSI codes
std::string pipe_finalized_text;
std::ostringstream pipe_current_text;
std::ofstream fout;
if (params.fname_out.length() > 0) {
fout.open(params.fname_out);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
whisper_free(ctx);
if (ctx_translate) whisper_free(ctx_translate);
return 1;
}
}
wav_writer wavWriter;
if (params.save_audio) {
time_t now = time(0);
char buffer[80];
std::tm tm_buf;
localtime_r(&now, &tm_buf);
strftime(buffer, sizeof(buffer), "%Y%m%d%H%M%S", &tm_buf);
std::string filename = std::string(buffer) + ".wav";
wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
}
// Initialize auto-copy session
AutoCopySession auto_copy_session;
// Initialize export session
ExportSession export_session;
if (params.export_enabled) {
// Setup session metadata
export_session.metadata.session_id = export_session.session_id;
export_session.metadata.start_time = std::chrono::system_clock::now();
export_session.metadata.model_name = params.model;
export_session.metadata.language = params.language;
export_session.metadata.coreml_enabled = params.use_coreml;
export_session.metadata.thread_count = params.n_threads;
export_session.metadata.vad_threshold = params.vad_thold;
export_session.metadata.step_ms = params.step_ms;
export_session.metadata.length_ms = params.length_ms;
#ifdef RECOGNIZE_VERSION
export_session.metadata.version = "recognize-" RECOGNIZE_VERSION;
#else
export_session.metadata.version = "recognize-dev";
#endif
}
// Initialize speaker tracker and meeting session
SpeakerTracker speaker_tracker;
MeetingSession meeting_session;
std::string meeting_output_file;
if (params.meeting_mode) {
meeting_output_file = generate_meeting_filename(params.meeting_name);
}
// ─── Push-to-Talk mode ───────────────────────────────────────────────
if (params.ptt_mode) {
// For English input, force transcription mode instead of translation.
// Whisper's translate mode with language="en" is untested/unsupported —
// the model was trained for X→English, not English→English. Using
// translate=true with English audio produces empty segments and degraded
// quality (whisper.cpp issues #2065, #2678, #3278).
if (params.language == "en" && params.output_mode == "english") {
params.output_mode = "original";
}
PushToTalkManager ptt;
int ptt_key_code = PushToTalkManager::key_name_to_code(params.ptt_key);
if (!ptt.start(ptt_key_code)) {
fprintf(stderr, "Failed to start PTT. Check Input Monitoring permissions.\n");
whisper_free(ctx);
if (ctx_translate) whisper_free(ctx_translate);
return 7;
}
// Signal readiness — PTT_READY on stderr for script detection (keeps stdout clean for transcript)
fprintf(stderr, "PTT_READY\n");
if (stderr_is_tty) {
fprintf(stderr, "[Ready — hold %s to record, release to transcribe]\n",
params.ptt_key.c_str());
}
fflush(stderr);
bool is_running_ptt = true;
std::string ptt_pipe_text;
const bool stderr_tty = isatty(STDERR_FILENO);
// PTT: single-shot — record once, transcribe, exit (no quit confirmation)
while (is_running_ptt && !g_interrupt_received.load()) {
// Wait for key press
while (!ptt.is_key_held() && is_running_ptt && !g_interrupt_received.load()) {
is_running_ptt = sdl_poll_events();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
if (!is_running_ptt || g_interrupt_received.load()) break;
// Key pressed — start capture
auto t_press = std::chrono::high_resolution_clock::now();
if (stderr_tty) fprintf(stderr, "\r[Recording...] ");
// Capture while key is held, with periodic streaming preview
auto t_last_preview = t_press;
std::string last_preview_text;
while (ptt.is_key_held() && is_running_ptt && !g_interrupt_received.load()) {
is_running_ptt = sdl_poll_events();
auto now = std::chrono::high_resolution_clock::now();
int elapsed_ms = static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(now - t_press).count());
int since_preview = static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(now - t_last_preview).count());
// Run streaming preview every 3s, starting after 2s of recording
if (elapsed_ms >= 2000 && since_preview >= 3000) {
t_last_preview = now;
// Grab recent audio (up to 28s for single whisper window)
int preview_ms = std::min(elapsed_ms + params.ptt_pre_roll_ms, 28000);
std::vector<float> preview_audio;
audio.get(preview_ms, preview_audio);
if (params.normalize_audio) normalize_audio(preview_audio);
whisper_params preview_params = params;
preview_params.beam_size = 1;
preview_params.no_fallback = true;
preview_params.no_context = true;
preview_params.max_tokens = 128;
std::vector<BilingualSegment> preview_results;
if (process_audio_segment(ctx, nullptr, preview_params, preview_audio,
preview_results, prompt_tokens) == 0) {
std::string preview_text;
for (const auto& seg : preview_results) {
if (!seg.original_text.empty())
preview_text += seg.original_text;
else if (!seg.english_text.empty())
preview_text += seg.english_text;
}
preview_text = filter_hallucinations(preview_text);
if (!preview_text.empty() && preview_text != last_preview_text) {
// Show short tail (~80 chars) with duration indicator
std::string display = preview_text;
if (display.size() > 80) {
display = "..." + display.substr(display.size() - 77);
}
if (stderr_tty) {
fprintf(stderr, "\r\033[2K[%ds]%s", elapsed_ms / 1000, display.c_str());
}
fprintf(stderr, "\n[PREVIEW %ds]%s\n", elapsed_ms / 1000, display.c_str());
fflush(stderr);
last_preview_text = preview_text;
}
}
if (stderr_tty) fprintf(stderr, "\r[Recording...] ");
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
if (!is_running_ptt || g_interrupt_received.load()) break;
// Key released — get audio and transcribe
auto t_release = std::chrono::high_resolution_clock::now();
int duration_ms = static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(
t_release - t_press).count());
if (duration_ms < 200) {
// Too short, likely accidental tap — wait for another press
if (stderr_tty) fprintf(stderr, "\r[Too short, skipped] \n");
continue;
}
// Include pre-roll + lead-in as real audio from the circular buffer.
// Using real ambient audio instead of synthetic zeros avoids whisper's
// no-speech detection (trained on trailing silence, not leading zeros).
const int lead_in_ms = 200;
int total_ms = std::min(duration_ms + params.ptt_pre_roll_ms + lead_in_ms, audio_buffer_ms);
if (stderr_tty && duration_ms + params.ptt_pre_roll_ms + lead_in_ms > audio_buffer_ms) {
fprintf(stderr, "\r[Warning: held %.0fs but buffer holds %.0fs — beginning truncated]\n",
duration_ms / 1000.0, audio_buffer_ms / 1000.0);
}
std::vector<float> pcmf32_ptt;
audio.get(total_ms, pcmf32_ptt);
// Trim trailing silence to reduce inference time (RMS energy check)
{
const int samples_per_100ms = WHISPER_SAMPLE_RATE / 10;
const int min_samples = WHISPER_SAMPLE_RATE / 2; // keep at least 500ms
const float silence_rms_threshold = 0.01f;
while (static_cast<int>(pcmf32_ptt.size()) > min_samples + samples_per_100ms) {
float sum_sq = 0.0f;
for (size_t i = pcmf32_ptt.size() - samples_per_100ms; i < pcmf32_ptt.size(); ++i) {
sum_sq += pcmf32_ptt[i] * pcmf32_ptt[i];
}
float rms = std::sqrt(sum_sq / samples_per_100ms);
if (rms < silence_rms_threshold) {
pcmf32_ptt.resize(pcmf32_ptt.size() - samples_per_100ms);
} else {
break;
}
}
}
if (params.normalize_audio) {
normalize_audio(pcmf32_ptt);
}
// Duration actually sent to inference (after trailing-silence trimming).
float actual_duration_s = pcmf32_ptt.size() / static_cast<float>(WHISPER_SAMPLE_RATE);
if (stderr_tty) fprintf(stderr, "\r[Transcribing %.1fs...] ", actual_duration_s);
// PTT-optimized inference: beam search for quality, no temperature
// fallback (causes issues at chunk boundaries). Short chunks (15s)
// keep whisper reliable — longer audio causes attention dropout.
whisper_params ptt_params = params;
// Default to beam width 5 when the user did not request beam search.
if (ptt_params.beam_size <= 0) ptt_params.beam_size = 5;
ptt_params.no_fallback = true;
ptt_params.no_context = true;
ptt_params.max_tokens = 256;
// PTT guarantees speech (user holds a button), so relax no-speech
// detection to prevent whisper from skipping onset segments.
ptt_params.no_speech_thold = 0.9f;
// Process audio in ≤15-second chunks. The first chunk already has
// real ambient audio as lead-in (from extended audio.get above).
// Subsequent chunks use overlap from the previous chunk's tail,
// matching whisper.cpp's keep_ms streaming pattern.
std::vector<BilingualSegment> bilingual_results;
const int chunk_samples = WHISPER_SAMPLE_RATE * 15;
const int overlap_samples = WHISPER_SAMPLE_RATE / 5; // 200ms overlap between chunks
bool inference_failed = false;
for (size_t offset = 0; offset < pcmf32_ptt.size(); offset += chunk_samples) {
size_t remaining = pcmf32_ptt.size() - offset;
size_t chunk_size = std::min(remaining, static_cast<size_t>(chunk_samples));
// Avoid a tiny trailing chunk (< 2s) — merge with previous
if (remaining > static_cast<size_t>(chunk_samples) &&
remaining < static_cast<size_t>(chunk_samples + WHISPER_SAMPLE_RATE * 2)) {
chunk_size = remaining;
}
std::vector<float> chunk;
if (offset == 0) {
// First chunk: real audio already includes lead-in from
// the extended audio.get() — no synthetic silence needed.
chunk.assign(pcmf32_ptt.begin(),
pcmf32_ptt.begin() + chunk_size);
} else {
// Subsequent chunks: prepend overlap from previous chunk's
// tail for acoustic continuity (real audio, not zeros).
size_t overlap = std::min(static_cast<size_t>(overlap_samples), offset);
chunk.resize(overlap + chunk_size);
std::copy(pcmf32_ptt.begin() + offset - overlap,
pcmf32_ptt.begin() + offset + chunk_size,
chunk.begin());
}
if (stderr_tty && offset > 0) {
fprintf(stderr, "\r[Transcribing %.1fs... chunk %zu/%zu] ",
actual_duration_s, offset / chunk_samples + 1,
(pcmf32_ptt.size() + chunk_samples - 1) / chunk_samples);
}
std::vector<BilingualSegment> chunk_results;
if (process_audio_segment(ctx, ctx_translate, ptt_params, chunk,
chunk_results, prompt_tokens) != 0) {
fprintf(stderr, "\nfailed to process audio chunk\n");
inference_failed = true;
break;
}
bilingual_results.insert(bilingual_results.end(),
chunk_results.begin(), chunk_results.end());
}
if (inference_failed) break;
// Apply hallucination filter
for (auto& seg : bilingual_results) {
if (!seg.original_text.empty())
seg.original_text = filter_hallucinations(seg.original_text);
if (!seg.english_text.empty())
seg.english_text = filter_hallucinations(seg.english_text);
}
// Drop segments the hallucination filter emptied out entirely.
bilingual_results.erase(
std::remove_if(bilingual_results.begin(), bilingual_results.end(),
[](const BilingualSegment& s) {
return s.original_text.empty() && s.english_text.empty();
}),
bilingual_results.end());
// Refine via Claude if enabled
if (params.refine && !bilingual_results.empty()) {
// Concatenate raw text for refinement
std::string raw_text;
for (const auto& seg : bilingual_results) {
if (!seg.original_text.empty()) raw_text += seg.original_text;
else if (!seg.english_text.empty()) raw_text += seg.english_text;
}
if (!raw_text.empty()) {
std::string refined = refine_transcription(raw_text);
// Replace all segments with a single refined segment
// (copy the first segment to keep its metadata, and put the refined
// text on whichever language field that segment was using).
BilingualSegment refined_seg = bilingual_results[0];
if (!refined_seg.original_text.empty()) {
refined_seg.original_text = " " + refined;
} else {
refined_seg.english_text = " " + refined;
}
refined_seg.speaker_turn = false;
bilingual_results.clear();
bilingual_results.push_back(refined_seg);
}
}
// Display results
// Terminal: clear the transient status line; pipe mode: collect output
// into a buffer for one final dump after the loop.
if (stderr_tty) fprintf(stderr, "\r \r");
if (stdout_is_tty) {
printf("\n");
}
std::ostringstream ptt_pipe_buf;
// pbuf is null on a TTY (print directly) and points at the local buffer
// when stdout is piped.
std::ostringstream* pbuf = stdout_is_tty ? nullptr : &ptt_pipe_buf;
print_bilingual_results(bilingual_results, params, auto_copy_session, export_session,
speaker_tracker, nullptr, stdout_is_tty, pbuf);
if (!stdout_is_tty) {
ptt_pipe_text += ptt_pipe_buf.str();
}
if (stdout_is_tty) {
printf("\n");
fflush(stdout);
}
// Single-shot: exit after first successful transcription
break;
}
// PTT teardown: stop the key listener, flush piped output, pause capture.
ptt.stop();
// Dump accumulated text in pipe mode
if (!stdout_is_tty && !ptt_pipe_text.empty()) {
printf("%s\n", ptt_pipe_text.c_str());
fflush(stdout);
}
audio.pause();
// Gather final transcript text for history
// (meeting session, piped buffer, or the auto-copy buffer, in that order).
std::string history_text;
if (params.meeting_mode) {
history_text = meeting_session.get_transcription();
} else if (!stdout_is_tty) {
history_text = ptt_pipe_text;
} else {
history_text = auto_copy_session.transcription_buffer.str();
}
// Finalize session
finalize_session(params, auto_copy_session, export_session, speaker_tracker,
meeting_session, meeting_output_file, history_text, auto_copy_session.start_time);
// Clear the recording flag, free whisper contexts, and exit (PTT mode
// returns here and never reaches the standard-mode code below).
g_is_recording.store(false);
whisper_free(ctx);
if (ctx_translate) whisper_free(ctx_translate);
return 0;
}
// ─── Standard (non-PTT) mode ────────────────────────────────────────
// Clean ready state for standard mode
if (stderr_is_tty) {