From 3860aa655eace5d59933a8c4f84e34eb56e0db67 Mon Sep 17 00:00:00 2001
From: Iaminfitite
Date: Sat, 25 Apr 2026 01:00:40 +0400
Subject: [PATCH] =?UTF-8?q?add=20helpers/transcribe=5Fwhisper.py=20?=
 =?UTF-8?q?=E2=80=94=20opt-in=20offline=20transcription?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop-in alternative to transcribe.py for users who want offline,
no-API-cost transcription. Uses faster-whisper with word-level
timestamps and emits the same JSON shape pack_transcripts.py and
downstream helpers already consume (type/text/start/end/speaker_id,
with synthesized "spacing" entries to preserve silence-gap data).

Positioned as opt-in, not a replacement:
- faster-whisper is added under [project.optional-dependencies] as
  `whisper`, installed via `uv sync --extra whisper`
- existing helpers/transcribe.py (ElevenLabs Scribe) remains the
  documented default; SKILL.md's anti-pattern note on local-CPU Whisper
  still applies to anyone who picks this path
- no diarization, no audio-event tagging — the docstring is explicit
  about these caveats

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 helpers/transcribe_whisper.py | 188 ++++++++++++++++++++++++++++++++++
 pyproject.toml                |   1 +
 2 files changed, 189 insertions(+)
 create mode 100644 helpers/transcribe_whisper.py
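
A quick smoke test for the emitted shape (a sketch only: the path is a
placeholder, and the assertions describe this script's output, not
Scribe's):

    import json
    from pathlib import Path

    # Placeholder path; point it at a transcript produced by this script.
    doc = json.loads(Path("edit/transcripts/clip.json").read_text())
    assert {"text", "language_code", "words"} <= set(doc)
    for w in doc["words"]:
        assert w["type"] in ("word", "spacing")
        assert w["start"] <= w["end"]
        # speaker_id is "speaker_0" on words, null on spacing entries.
        assert (w["speaker_id"] is None) == (w["type"] == "spacing")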

diff --git a/helpers/transcribe_whisper.py b/helpers/transcribe_whisper.py
new file mode 100644
index 0000000..f40d7d2
--- /dev/null
+++ b/helpers/transcribe_whisper.py
@@ -0,0 +1,188 @@
+"""Transcribe a video with local faster-whisper.
+
+Drop-in alternative to `transcribe.py` (ElevenLabs Scribe) for offline,
+no-API-cost use. Produces the same JSON shape that `pack_transcripts.py`
+and downstream helpers expect:
+
+    {
+      "text": "",
+      "language_code": "en",
+      "words": [
+        {"type": "word", "text": "hello", "start": 0.12, "end": 0.38, "speaker_id": "speaker_0"},
+        {"type": "spacing", "text": " ", "start": 0.38, "end": 0.41, "speaker_id": null},
+        ...
+      ]
+    }
+
+Caveats vs Scribe:
+- No diarization. All words get `speaker_id = "speaker_0"`. If --num-speakers
+  is passed, it's accepted but ignored (parity with the Scribe CLI).
+- No audio-event tagging (laughter, applause, etc.) — only "word" and "spacing".
+- Filler words ("um", "uh") are transcribed as regular words; downstream
+  filler detection still works because it operates on text.
+
+Usage:
+    python helpers/transcribe_whisper.py <video>
+    python helpers/transcribe_whisper.py <video> --edit-dir /custom/edit
+    python helpers/transcribe_whisper.py <video> --language en
+    python helpers/transcribe_whisper.py <video> --model large-v3
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+from faster_whisper import WhisperModel
+
+
+DEFAULT_MODEL = "large-v3"
+DEFAULT_COMPUTE_TYPE = "int8"
+
+
+def extract_audio(video_path: Path, dest: Path) -> None:
+    cmd = [
+        "ffmpeg", "-y", "-i", str(video_path),
+        "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le",
+        str(dest),
+    ]
+    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+
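+# Scribe emits explicit "spacing" tokens; faster-whisper only yields word
+# intervals. run_whisper() synthesizes spacing entries from the inter-word
+# gaps (prev_end -> next word start), so downstream silence-gap heuristics
+# see the same data either way.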
+def run_whisper(
+    audio_path: Path,
+    model_name: str,
+    compute_type: str,
+    language: str | None,
+) -> tuple[list[dict], str, str]:
+    model = WhisperModel(model_name, device="cpu", compute_type=compute_type)
+    segments_iter, info = model.transcribe(
+        str(audio_path),
+        language=language,
+        word_timestamps=True,
+        vad_filter=True,
+        vad_parameters={"min_silence_duration_ms": 400},
+    )
+
+    words_out: list[dict] = []
+    full_text_parts: list[str] = []
+    prev_end: float | None = None
+
+    for seg in segments_iter:
+        for w in (seg.words or []):
+            if w.start is None or w.end is None:
+                continue
+            text = w.word
+            stripped = text.strip()
+
+            if prev_end is not None and w.start > prev_end:
+                words_out.append({
+                    "type": "spacing",
+                    "text": " ",
+                    "start": prev_end,
+                    "end": w.start,
+                    "speaker_id": None,
+                })
+
+            words_out.append({
+                "type": "word",
+                "text": stripped if stripped else text,
+                "start": w.start,
+                "end": w.end,
+                "speaker_id": "speaker_0",
+            })
+            full_text_parts.append(stripped if stripped else text)
+            prev_end = w.end
+
+    return words_out, " ".join(full_text_parts).strip(), info.language or (language or "en")
+
+
+def transcribe_one(
+    video: Path,
+    edit_dir: Path,
+    model_name: str,
+    compute_type: str,
+    language: str | None = None,
+    verbose: bool = True,
+) -> Path:
+    transcripts_dir = edit_dir / "transcripts"
+    transcripts_dir.mkdir(parents=True, exist_ok=True)
+    out_path = transcripts_dir / f"{video.stem}.json"
+
+    if out_path.exists():
+        if verbose:
+            print(f"cached: {out_path.name}")
+        return out_path
+
+    if verbose:
+        print(f"  extracting audio from {video.name}", flush=True)
+
+    t0 = time.time()
+    with tempfile.TemporaryDirectory() as tmp:
+        audio = Path(tmp) / f"{video.stem}.wav"
+        extract_audio(video, audio)
+        if verbose:
+            size_mb = audio.stat().st_size / (1024 * 1024)
+            print(f"  transcribing {video.stem}.wav ({size_mb:.1f} MB) with {model_name} [{compute_type}]", flush=True)
+        words, full_text, detected_lang = run_whisper(audio, model_name, compute_type, language)
+
+    payload = {
+        "language_code": detected_lang,
+        "text": full_text,
+        "words": words,
+    }
+    out_path.write_text(json.dumps(payload, indent=2))
+    dt = time.time() - t0
+
+    if verbose:
+        kb = out_path.stat().st_size / 1024
+        word_count = sum(1 for w in words if w["type"] == "word")
+        print(f"  saved: {out_path.name} ({kb:.1f} KB) in {dt:.1f}s")
+        print(f"  words: {word_count}  lang: {detected_lang}")
+
+    return out_path
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description="Transcribe a video with local faster-whisper")
+    ap.add_argument("video", type=Path, help="Path to video file")
+    ap.add_argument("--edit-dir", type=Path, default=None,
+                    help="Edit output directory (default: <video dir>/edit)")
+    ap.add_argument("--language", type=str, default=None,
+                    help="Optional ISO language code (e.g., 'en'). Omit to auto-detect.")
+    ap.add_argument("--num-speakers", type=int, default=None,
+                    help="Accepted for CLI parity with transcribe.py; ignored (no diarization).")
+    ap.add_argument("--model", type=str, default=DEFAULT_MODEL,
+                    help=f"faster-whisper model (default: {DEFAULT_MODEL}). "
+                         "Options: tiny, base, small, medium, large-v2, large-v3, distil-large-v3.")
+    ap.add_argument("--compute-type", type=str, default=DEFAULT_COMPUTE_TYPE,
+                    help=f"ctranslate2 compute type (default: {DEFAULT_COMPUTE_TYPE}). "
+                         "int8 is fastest on CPU; this script runs CPU-only.")
+    args = ap.parse_args()
+
+    video = args.video.resolve()
+    if not video.exists():
+        sys.exit(f"video not found: {video}")
+
+    edit_dir = (args.edit_dir or (video.parent / "edit")).resolve()
+
+    transcribe_one(
+        video=video,
+        edit_dir=edit_dir,
+        model_name=args.model,
+        compute_type=args.compute_type,
+        language=args.language,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 1221f19..ef56a0d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
 
 [project.optional-dependencies]
 animations = ["manim"]
+whisper = ["faster-whisper>=1.2.1"]
 
 [build-system]
 requires = ["setuptools>=61.0"]
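
-- 
The helper is also importable; a minimal sketch, assuming `helpers`
resolves as a package from the repo root, the `whisper` extra is
installed, and ffmpeg is on PATH ("talk.mp4" is a placeholder):

    from pathlib import Path

    from helpers.transcribe_whisper import transcribe_one

    out = transcribe_one(
        video=Path("talk.mp4").resolve(),
        edit_dir=Path("edit"),
        model_name="large-v3",
        compute_type="int8",
    )
    print(out)  # -> edit/transcripts/talk.json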