diff --git a/.claude/commands/process-bookmarks.md b/.claude/commands/process-bookmarks.md
index f04faa7..fd4a3f1 100644
--- a/.claude/commands/process-bookmarks.md
+++ b/.claude/commands/process-bookmarks.md
@@ -59,11 +59,11 @@ TodoWrite({ todos: [
**CRITICAL for parallel processing:** Spawn ALL subagents in ONE message, each writing to a batch file:
```javascript
// Send ONE message with multiple Task calls - they run in parallel
-// Use model="haiku" for cost-efficient parallel processing (~50% cost savings)
+// Use model="sonnet" for high-quality summarization and insight extraction
// Each subagent writes to .state/batch-N.md, NOT to bookmarks.md!
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 0: write to .state/batch-0.md: {json for bookmarks 0-4}")
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 1: write to .state/batch-1.md: {json for bookmarks 5-9}")
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 2: write to .state/batch-2.md: {json for bookmarks 10-14}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 0: write to .state/batch-0.md: {json for bookmarks 0-4}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 1: write to .state/batch-1.md: {json for bookmarks 5-9}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 2: write to .state/batch-2.md: {json for bookmarks 10-14}")
// ... all batches in the SAME message
```
@@ -107,8 +107,8 @@ Each bookmark includes:
- `id`, `author`, `authorName`, `text`, `tweetUrl`, `date`
- `tags[]` - folder tags from bookmark folders (e.g., `["ai-tools"]`)
- `links[]` - each with `original`, `expanded`, `type`, and `content`
- - `type`: "github", "article", "video", "tweet", "media", "image"
- - `content`: extracted text, headline, author (for articles/github)
+ - `type`: "github", "article", "video", "podcast", "tweet", "media", "image"
+ - `content`: extracted text, headline, author (for articles/github); for video/podcast: `{ source: 'yt-dlp-captions'|'whisper', transcriptFile: '.state/transcripts/{id}.txt', chars: N }`
- `isReply`, `replyContext` - parent tweet info if this is a reply
- `isQuote`, `quoteContext` - quoted tweet info if this is a quote tweet
@@ -160,13 +160,15 @@ For each bookmark (or batch):
#### a. Determine the best title/summary
-Don't use generic titles like "Article" or "Tweet". Based on the content:
+Don't use generic titles like "Article" or "Tweet". The title appears after `## @author - ` and must be descriptive enough to scan in a list. Based on the content:
- GitHub repos: Use the repo name and brief description
- Articles: Use the article headline or key insight
-- Videos: Note for transcript, use tweet context
-- Quote tweets: Capture the key insight being highlighted
+- Videos/Podcasts with transcript: Use the key insight or topic from the transcript content
+- Videos/Podcasts without transcript: Note for transcript, use tweet context
+- Quote tweets: Capture the key insight being highlighted, NOT the reaction (e.g., "JC_builds: Local Calorie Estimation Model Beating GPT-4o" not "Retweet: Are you guys starting to catch on?")
- Reply threads: Include parent context in the summary
- Plain tweets: Use the key point being made
+- Image/video-only with no context: Use `@{author} - [Media] {best guess from author's bio/context}` — never just "Video post" or the raw t.co URL
#### b. Categorize using the categories config
@@ -175,7 +177,7 @@ Match each bookmark's links against category patterns (check `match` arrays). Us
**For each action type:**
- `file`: Create a separate file in the category's folder using its template
- `capture`: Just add to bookmarks.md (no separate file)
-- `transcribe`: Add to bookmarks.md with a "Needs transcript" flag, optionally create placeholder in folder
+- `transcribe`: If the link's `content` has a `transcriptFile` path, read the first 20,000 characters of that file (enough for summarization — don't read the whole file for long transcripts). Create a full knowledge file with structured summary, key takeaways, and insights, with `status: transcribed` and `transcript_source` in frontmatter. If `content` is null or has no `transcriptFile`, fall back to placeholder behavior with `status: needs_transcript`
**Special handling:**
- Quote tweets: Include quoted tweet context in entry
@@ -185,6 +187,8 @@ Match each bookmark's links against category patterns (check `match` arrays). Us
**Use the Edit tool** to insert entries into the `archiveFile` (expand `~` to home directory). NEVER use Write - it will destroy existing entries.
+**DEDUPLICATION CHECK (CRITICAL):** Before inserting ANY entry, search bookmarks.md for the tweet URL (`x.com/{author}/status/{id}`). If it already exists, SKIP it — do not create a duplicate. Log: `Skipping duplicate: @{author} {id}`.
+
**CRITICAL ordering rules for bookmarks.md:**
The file must be in **descending chronological order** (newest dates at TOP, oldest at BOTTOM).
@@ -216,9 +220,27 @@ The file must be in **descending chronological order** (newest dates at TOP, old
- **Link:** {expanded_url}
- **Tags:** [[tag1]] [[tag2]] (if bookmark has tags from folders)
- **Filed:** [{filename}](./knowledge/tools/{slug}.md) (if filed)
-- **What:** {1-2 sentence description of what this actually is}
+- **What:** {1-2 sentence synthesis — see What Field Rules below}
```
+### What Field Rules (CRITICAL — apply to every entry)
+
+The **What** field is the most important metadata. It must be a **synthesis**, not a label or echo.
+
+**Minimum bar:** 80+ characters, written as a complete sentence describing what the bookmark contains and why it matters. Descriptions under 80 characters are almost always lazy echoes or category labels — rewrite them until they actually say something.
+
+**NEVER write:**
+- Category labels: "Tool or resource share", "Commentary/perspective", "Claude Code insights/comparison"
+- Echo of the tweet text: If the tweet says "karpathy really is the fucking goat", the What must explain *what about Karpathy* — don't parrot the reaction
+- Generic placeholders: "Video content post", "Social media image bookmark", "Chart showing interesting data trend"
+- Raw URLs: Never put a t.co or any URL as the What
+
+**For quote tweets / replies:** The What MUST synthesize BOTH the author's commentary AND the quoted/parent content. The quoted content often contains the actual substance — a reaction tweet like "Are you guys starting to catch on?" is worthless without explaining what the quoted tweet actually describes.
+
+**For thin-content tweets (image-only, short with no link):** Write what you CAN infer from the author, text, and any visible context, prefixed with `THIN:` so downstream tools can flag these for manual review. Example: `THIN: @calebporzio reacting to an unspecified Chrome feature — tweet is image/video only, no text context available.`
+
+**For failed link expansion (raw t.co links):** Write `LINK_FAILED: Could not expand link from @{author} — original t.co URL did not resolve.`
+
**Tags format:** Use wiki-link style `[[TagName]]` for each tag. Only include the **Tags:** line if the bookmark has tags in its `tags` array (from folder configuration). Example: `- **Tags:** [[AI]] [[Coding]]`
**For quote tweets, include the quoted content:**
@@ -261,6 +283,12 @@ const remaining = pending.bookmarks.filter(b => !processedIds.has(b.id));
pending.bookmarks = remaining;
pending.count = remaining.length;
fs.writeFileSync(pendingPath, JSON.stringify(pending, null, 2));
+
+// Clean up transcript files for processed bookmarks
+for (const id of processedIds) {
+ const transcriptFile = path.join(path.dirname(pendingPath), 'transcripts', `${id}.txt`);
+ if (fs.existsSync(transcriptFile)) fs.unlinkSync(transcriptFile);
+}
```
### 4. Commit and Push Changes
@@ -354,6 +382,42 @@ via: "Twitter bookmark from @{author}"
### Podcast Entry (`./knowledge/podcasts/{slug}.md`)
+**When transcript content is available** (`content.source` is `'yt-dlp-captions'` or `'whisper'`):
+
+```yaml
+---
+title: "{episode_title}"
+type: podcast
+date_added: {YYYY-MM-DD}
+source: "{podcast_url}"
+show: "{show_name}"
+tags: [{relevant_tags}, {folder_tags}]
+via: "Twitter bookmark from @{author}"
+status: transcribed
+transcript_source: "{yt-dlp-captions or whisper}"
+---
+
+{Summary of the episode's key points synthesized from the transcript}
+
+## Episode Info
+
+- **Show:** {show_name}
+- **Episode:** {episode_title}
+- **Why bookmarked:** {context from tweet}
+
+## Key Takeaways
+
+- Point 1 (from transcript)
+- Point 2 (from transcript)
+
+## Links
+
+- [Episode]({podcast_url})
+- [Original Tweet]({tweet_url})
+```
+
+**When no transcript is available** (`content` is null or has no `transcriptFile`):
+
```yaml
---
title: "{episode_title}"
@@ -386,6 +450,42 @@ status: needs_transcript
### Video Entry (`./knowledge/videos/{slug}.md`)
+**When transcript content is available** (`content.source` is `'yt-dlp-captions'` or `'whisper'`):
+
+```yaml
+---
+title: "{video_title}"
+type: video
+date_added: {YYYY-MM-DD}
+source: "{video_url}"
+channel: "{channel_name}"
+tags: [{relevant_tags}, {folder_tags}]
+via: "Twitter bookmark from @{author}"
+status: transcribed
+transcript_source: "{yt-dlp-captions or whisper}"
+---
+
+{Summary of the video's key points synthesized from the transcript}
+
+## Video Info
+
+- **Channel:** {channel_name}
+- **Title:** {video_title}
+- **Why bookmarked:** {context from tweet}
+
+## Key Takeaways
+
+- Point 1 (from transcript)
+- Point 2 (from transcript)
+
+## Links
+
+- [Video]({video_url})
+- [Original Tweet]({tweet_url})
+```
+
+**When no transcript is available** (`content` is null or has no `transcriptFile`):
+
```yaml
---
title: "{video_title}"
@@ -427,10 +527,10 @@ status: needs_transcript
Spawn multiple Task subagents in ONE message. Each writes to a separate temp file:
```
-Task 1: model="haiku", "Process batch 0" → writes to .state/batch-0.md
-Task 2: model="haiku", "Process batch 1" → writes to .state/batch-1.md
-Task 3: model="haiku", "Process batch 2" → writes to .state/batch-2.md
-Task 4: model="haiku", "Process batch 3" → writes to .state/batch-3.md
+Task 1: model="sonnet", "Process batch 0" → writes to .state/batch-0.md
+Task 2: model="sonnet", "Process batch 1" → writes to .state/batch-1.md
+Task 3: model="sonnet", "Process batch 2" → writes to .state/batch-2.md
+Task 4: model="sonnet", "Process batch 3" → writes to .state/batch-3.md
```
**Subagent prompt template:**
@@ -448,9 +548,19 @@ DATE: {bookmark.date}
- **Tweet:** {url}
- **Tags:** [[tag1]] [[tag2]] (if tags exist)
-- **What:** {description}
-
-Also create knowledge files (./knowledge/tools/*.md, ./knowledge/articles/*.md) as needed.
+- **What:** {1-2 sentence synthesis, 80+ chars minimum}
+
+**What field rules (MUST follow):**
+- Write a SYNTHESIS, not a category label or echo of the tweet
+- NEVER write generic labels like "Tool or resource share" or "Commentary/perspective"
+- NEVER just echo the tweet text as the What — explain what the content IS and why it matters
+- For quote tweets: synthesize BOTH the commentary AND the quoted content (the quote often has the substance)
+- For image/video-only tweets with no text context: prefix with "THIN:" and describe what you can infer
+- For failed link expansion (raw t.co URLs): write "LINK_FAILED: Could not expand link from @{author}"
+- Minimum 80 characters. Under 80 means you're echoing or categorizing, not synthesizing. Rewrite.
+
+For video/podcast bookmarks with a transcriptFile in content, read the FIRST 20,000 characters of that file for summarization.
+Also create knowledge files (./knowledge/tools/*.md, ./knowledge/articles/*.md, ./knowledge/videos/*.md, ./knowledge/podcasts/*.md) as needed.
DO NOT touch bookmarks.md - only write to .state/batch-{N}.md
```
@@ -468,6 +578,7 @@ After ALL subagents complete:
**Merge logic for bookmarks.md:**
- File is descending order (newest dates at top)
- For each entry from batch files (processed in order):
+ - **DEDUP CHECK FIRST:** Search bookmarks.md for the tweet URL. If already present, skip.
- Find or create the date section at correct position
- Insert entry at TOP of that date section
- Since batches are oldest-first, entries end up in correct order
diff --git a/README.md b/README.md
index 7f3517a..12a090e 100644
--- a/README.md
+++ b/README.md
@@ -154,10 +154,11 @@ Categories define how different bookmark types are handled. Smaug comes with sen
| **github** | github.com | file | `./knowledge/tools/` |
| **article** | medium.com, substack.com, dev.to, blogs | file | `./knowledge/articles/` |
| **x-article** | x.com/i/article/* | file | `./knowledge/articles/` |
+| **podcast** | podcasts.apple.com, spotify.com/episode, overcast.fm | transcribe | `./knowledge/podcasts/` |
+| **youtube** | youtube.com, youtu.be | transcribe | `./knowledge/videos/` |
+| **video** | vimeo.com, loom.com | transcribe | `./knowledge/videos/` |
| **tweet** | (fallback) | capture | bookmarks.md only |
-🔜 _Note: Transcription is flagged but not yet automated. PRs welcome!_
-
### X/Twitter Long-Form Articles
X articles (`x.com/i/article/*`) are Twitter's native long-form content format. Smaug extracts the full article text using bird CLI:
@@ -177,11 +178,56 @@ Example X article bookmark:
- **What:** Deep dive into patterns from scaling CrewAI to billions of agent executions.
```
+### Video & Podcast Transcription
+
+Smaug automatically extracts transcripts from YouTube videos, podcasts, and other video content using a tiered strategy:
+
+1. **Captions (fast)**: yt-dlp downloads existing subtitles — no audio processing needed
+2. **Whisper (fallback)**: If no captions exist, yt-dlp extracts audio and Whisper transcribes it locally
+3. **Placeholder**: If no transcript can be obtained — yt-dlp is not installed, the platform is unsupported, or both tiers fail — bookmarks are flagged with `status: needs_transcript`
+
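+Both tools are **optional** — install what you need (note that the Whisper fallback still relies on yt-dlp to extract the audio, so Whisper alone is not enough):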
+
+```bash
+# For caption extraction (recommended — handles most YouTube videos)
+pip install yt-dlp
+# or: brew install yt-dlp
+
+# For audio transcription when captions aren't available
+pip install openai-whisper
+
+# Required for Whisper (Tier 2) — ffmpeg handles audio extraction and decoding
+# Not needed if you only use caption extraction (Tier 1)
+sudo apt-get install ffmpeg # Debian/Ubuntu
+# or: brew install ffmpeg # macOS
+```
+
+**Supported platforms:** YouTube (excellent), Vimeo (good), SoundCloud (audio only), direct video URLs. Spotify and Apple Podcasts are not supported by yt-dlp — these produce placeholder files.
+
+**Configuration** (all optional, in `smaug.config.json`):
+
+```json
+{
+ "ytdlpPath": "/custom/path/to/yt-dlp",
+ "whisperPath": "/custom/path/to/whisper",
+ "whisperModel": "small.en",
+ "transcribeTimeouts": {
+ "subtitle": 30000,
+ "audio": 300000,
+ "whisper": 600000
+ }
+}
+```
+
+**How transcripts are stored:** Full transcripts are written to `.state/transcripts/{tweet_id}.txt` — they never bloat the pending JSON, even for 6-hour podcasts. During processing, Claude reads only the first ~20K characters needed for summarization.
+
+Environment variables: `YTDLP_PATH`, `WHISPER_PATH`, `WHISPER_MODEL`.
+
### Actions
- **file**: Create a separate markdown file with rich metadata
- **capture**: Add to bookmarks.md only (no separate file)
-- **transcribe**: Flag for future transcription *(auto-transcription coming soon! PRs welcome)*
+- **transcribe**: Extract transcript via yt-dlp captions or Whisper, then create a rich knowledge file. Falls back to a placeholder if no transcript can be obtained (tools not installed, unsupported platform, or extraction failure)
### Custom Categories
diff --git a/src/config.js b/src/config.js
index 1e6aac4..4ceeacb 100644
--- a/src/config.js
+++ b/src/config.js
@@ -34,6 +34,23 @@ const DEFAULT_CONFIG = {
// Path to bird CLI (if not in PATH)
birdPath: null,
+ // Path to yt-dlp CLI (if not in PATH)
+ ytdlpPath: null,
+
+ // Path to whisper CLI (if not in PATH)
+ whisperPath: null,
+
+ // Whisper model to use (default: small.en)
+ whisperModel: 'small.en',
+
+ // Transcription timeouts (ms)
+ transcribeTimeouts: {
+ subtitle: 30000, // 30s for subtitle download
+ audio: 300000, // 5min for audio extraction
+ whisper: 600000 // 10min for whisper transcription
+ },
+
+
// Twitter credentials (can also use AUTH_TOKEN and CT0 env vars)
twitter: {
authToken: null,
@@ -221,6 +238,15 @@ export function loadConfig(configPath) {
if (process.env.BIRD_PATH) {
config.birdPath = process.env.BIRD_PATH;
}
+ if (process.env.YTDLP_PATH) {
+ config.ytdlpPath = process.env.YTDLP_PATH;
+ }
+ if (process.env.WHISPER_PATH) {
+ config.whisperPath = process.env.WHISPER_PATH;
+ }
+ if (process.env.WHISPER_MODEL) {
+ config.whisperModel = process.env.WHISPER_MODEL;
+ }
if (process.env.SOURCE) {
config.source = process.env.SOURCE;
}
@@ -268,6 +294,8 @@ export function loadConfig(configPath) {
config.pendingFile = expandTilde(config.pendingFile);
config.stateFile = expandTilde(config.stateFile);
config.birdPath = expandTilde(config.birdPath);
+ config.ytdlpPath = expandTilde(config.ytdlpPath);
+ config.whisperPath = expandTilde(config.whisperPath);
config.projectRoot = expandTilde(config.projectRoot);
// Expand ~ in category folders
diff --git a/src/index.js b/src/index.js
index 6e67e08..49ae3d5 100644
--- a/src/index.js
+++ b/src/index.js
@@ -13,6 +13,11 @@ export {
fetchGitHubContent,
fetchArticleContent,
fetchXArticleContent,
+ fetchTranscriptContent,
+ parseJson3Transcript,
+ parseVttTranscript,
+ findYtDlp,
+ findWhisper,
isPaywalled,
loadState,
saveState
diff --git a/src/processor.js b/src/processor.js
index 2c1bfbc..db6a157 100644
--- a/src/processor.js
+++ b/src/processor.js
@@ -541,6 +541,244 @@ export async function fetchArticleContent(url) {
}
}
+// ---- Video/Podcast Transcript Extraction ----
+
+/**
+ * Shared lookup behind findYtDlp() and findWhisper().
+ * Prefers an explicitly configured path when that file exists on disk;
+ * otherwise asks `which` to resolve the tool name from PATH.
+ *
+ * @param {string} toolName - Executable name handed to `which`
+ * @param {object} [options]
+ * @param {Function} [options.execSyncFn] - Replacement for execSync (dependency injection for tests)
+ * @param {string|null} [options.configPath] - User-configured path to the binary
+ * @returns {string|null} Resolved path, or null when the tool cannot be found
+ */
+function locateCliTool(toolName, options = {}) {
+  const execSyncFn = options.execSyncFn || execSync;
+  const configPath = options.configPath || null;
+
+  // A configured path wins only if it actually exists; otherwise fall through to PATH.
+  if (configPath && fs.existsSync(configPath)) return configPath;
+
+  try {
+    const resolved = execSyncFn(`which ${toolName}`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' }).trim();
+    // Blank output means nothing was resolved; keep the string|null contract.
+    return resolved || null;
+  } catch {
+    // execSync throws when `which` exits non-zero, i.e. the tool is not on PATH.
+    return null;
+  }
+}
+
+/**
+ * Locate yt-dlp binary. Returns path or null if not installed.
+ * Accepts options for dependency injection (testability).
+ *
+ * @param {object} [options] - { execSyncFn, configPath }
+ * @returns {string|null}
+ */
+export function findYtDlp(options = {}) {
+  return locateCliTool('yt-dlp', options);
+}
+
+/**
+ * Locate whisper binary. Returns path or null if not installed.
+ * Accepts options for dependency injection (testability).
+ *
+ * @param {object} [options] - { execSyncFn, configPath }
+ * @returns {string|null}
+ */
+export function findWhisper(options = {}) {
+  return locateCliTool('whisper', options);
+}
+
+/**
+ * Convert a YouTube json3 caption payload into newline-separated plain text.
+ * Each event's segs[].utf8 pieces are concatenated and trimmed; events
+ * without segs, empty results, and immediate repeats of the previous line
+ * are dropped. Returns '' when the payload cannot be parsed or walked.
+ *
+ * @param {string} content - Raw json3 file contents
+ * @returns {string}
+ */
+export function parseJson3Transcript(content) {
+  try {
+    const { events } = JSON.parse(content);
+    const collected = [];
+
+    for (const { segs } of events || []) {
+      if (!segs) continue;
+
+      const joined = segs
+        .map((seg) => seg.utf8 || '')
+        .join('')
+        .trim();
+
+      // Skip blanks and back-to-back duplicates (compare against the last kept line).
+      if (joined === '' || joined === collected[collected.length - 1]) continue;
+      collected.push(joined);
+    }
+
+    return collected.join('\n');
+  } catch {
+    return '';
+  }
+}
+
+/**
+ * Parse VTT (WebVTT) subtitle format into plain text.
+ * Strips headers, timestamps, cue identifiers and inline tags, decodes common
+ * HTML entities, and deduplicates adjacent identical lines.
+ *
+ * @param {string} content - Raw .vtt file contents
+ * @returns {string} Newline-separated caption text ('' for empty or non-string input)
+ */
+export function parseVttTranscript(content) {
+  if (!content || typeof content !== 'string') return '';
+
+  const lines = content.split('\n');
+  const textLines = [];
+  let lastLine = '';
+
+  for (const raw of lines) {
+    const line = raw.trim();
+
+    // Skip WEBVTT header, NOTE blocks, and empty lines
+    if (!line || line === 'WEBVTT' || line.startsWith('NOTE') || line.startsWith('Kind:') || line.startsWith('Language:')) continue;
+
+    // Skip timestamp lines (00:00:00.000 --> 00:00:05.000)
+    if (/^\d{2}:\d{2}/.test(line) && line.includes('-->')) continue;
+
+    // Skip cue identifiers (numeric lines before timestamps)
+    if (/^\d+$/.test(line)) continue;
+
+    // Strip VTT formatting tags: <c>, </c>, <00:00:01.234>, <b>, etc.
+    let cleaned = line
+      .replace(/<\/?c[^>]*>/g, '') // color/class tags
+      .replace(/<\d{2}:\d{2}[\d:.]*>/g, '') // inline timestamps
+      .replace(/<\/?[a-z][^>]*>/g, '') // other HTML-like tags
+      .trim();
+
+    // Decode common HTML entities. `&amp;` goes last so that an escaped
+    // entity such as `&amp;lt;` becomes the literal text `&lt;`, not `<`.
+    cleaned = cleaned
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&quot;/g, '"')
+      .replace(/&#39;/g, "'")
+      .replace(/&apos;/g, "'")
+      .replace(/&nbsp;/g, ' ')
+      .replace(/&amp;/g, '&');
+
+    if (cleaned && cleaned !== lastLine) {
+      textLines.push(cleaned);
+      lastLine = cleaned;
+    }
+  }
+
+  return textLines.join('\n');
+}
+
+/**
+ * Fetch transcript for a video/podcast URL using tiered strategy:
+ *   Tier 1: yt-dlp captions (subtitle-only, no download)
+ *   Tier 2: yt-dlp audio extraction + Whisper transcription
+ *   Tier 3: return null (placeholder behavior)
+ *
+ * External tools are launched with execFileSync and an argument array, so the
+ * URL (which originates from tweet content), the tool paths and the model name
+ * are passed verbatim and never interpreted by a shell.
+ *
+ * @param {string} url - The video/podcast URL
+ * @param {object} config - App config (transcribeTimeouts, whisperModel)
+ * @param {object} toolPaths - { ytdlp: string|null, whisper: string|null }
+ * @returns {Promise<{text: string, source: string}|null>}
+ */
+export async function fetchTranscriptContent(url, config, toolPaths) {
+  const { ytdlp, whisper } = toolPaths || {};
+  const timeouts = config?.transcribeTimeouts || {};
+  const subtitleTimeout = timeouts.subtitle || 30000;
+  const audioTimeout = timeouts.audio || 300000;
+  const whisperTimeout = timeouts.whisper || 600000;
+  const whisperModel = config?.whisperModel || 'small.en';
+
+  if (!ytdlp) {
+    console.log(' yt-dlp not available — install yt-dlp for video/podcast transcripts');
+    return null;
+  }
+
+  // Imported locally so this function does not depend on the file-level import list.
+  const { execFileSync } = await import('child_process');
+
+  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'smaug-transcript-'));
+
+  try {
+    // ---- Tier 1: Subtitle extraction ----
+    const isYouTube = /youtube\.com|youtu\.be/.test(url);
+    const subFormat = isYouTube ? 'json3' : 'vtt/best';
+    const outputTemplate = path.join(tmpDir, '%(id)s.%(ext)s');
+
+    try {
+      execFileSync(
+        ytdlp,
+        [
+          '--write-subs',
+          '--write-auto-subs',
+          '--sub-langs', 'en.*,-live_chat',
+          '--sub-format', subFormat,
+          '--skip-download',
+          '--no-warnings',
+          '-o', outputTemplate,
+          url
+        ],
+        { timeout: subtitleTimeout, stdio: 'pipe', encoding: 'utf8' }
+      );
+    } catch {
+      // yt-dlp may exit non-zero even when subtitle files were written
+      // (e.g., 429 on one variant while others succeeded). Check files below.
+    }
+
+    // Decide from the files on disk rather than from the exit status.
+    const files = fs.readdirSync(tmpDir);
+    const subFile = files.find(f => f.endsWith('.json3')) ||
+      files.find(f => f.endsWith('.vtt')) ||
+      files.find(f => f.endsWith('.srt'));
+
+    if (subFile) {
+      const subContent = fs.readFileSync(path.join(tmpDir, subFile), 'utf8');
+      const text = subFile.endsWith('.json3')
+        ? parseJson3Transcript(subContent)
+        : parseVttTranscript(subContent);
+
+      if (text && text.length > 0) {
+        console.log(` Transcript extracted via captions (${text.length} chars)`);
+        return { text, source: 'yt-dlp-captions' };
+      }
+    }
+
+    // ---- Tier 2: Audio extraction + Whisper ----
+    if (!whisper) {
+      console.log(' No captions available — install whisper for audio transcription');
+      return null;
+    }
+
+    console.log(' No captions found, extracting audio for Whisper...');
+    const audioFile = path.join(tmpDir, 'audio.mp3');
+
+    try {
+      execFileSync(
+        ytdlp,
+        ['-x', '--audio-format', 'mp3', '--audio-quality', '5', '-o', audioFile, url],
+        { timeout: audioTimeout, stdio: 'pipe', encoding: 'utf8' }
+      );
+    } catch (err) {
+      console.log(` Audio extraction failed: ${err.message?.split('\n')[0]}`);
+      return null;
+    }
+
+    if (!fs.existsSync(audioFile)) {
+      console.log(' Audio extraction produced no output');
+      return null;
+    }
+
+    console.log(` Transcribing with Whisper (model: ${whisperModel})...`);
+    try {
+      execFileSync(
+        whisper,
+        [audioFile, '--model', whisperModel, '--output_format', 'txt', '--output_dir', tmpDir],
+        { timeout: whisperTimeout, stdio: 'pipe', encoding: 'utf8' }
+      );
+    } catch (err) {
+      console.log(` Whisper transcription failed: ${err.message?.split('\n')[0]}`);
+      return null;
+    }
+
+    // Whisper outputs audio.txt
+    const txtFiles = fs.readdirSync(tmpDir).filter(f => f.endsWith('.txt'));
+    if (txtFiles.length > 0) {
+      const text = fs.readFileSync(path.join(tmpDir, txtFiles[0]), 'utf8').trim();
+      if (text.length > 0) {
+        console.log(` Transcript extracted via Whisper (${text.length} chars)`);
+        return { text, source: 'whisper' };
+      }
+    }
+
+    console.log(' Whisper produced no output');
+    return null;
+  } finally {
+    // Best-effort cleanup: a leftover temp directory must not mask the result.
+    try {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    } catch {}
+  }
+}
+
export async function fetchContent(url, type, config) {
// Use GitHub API for GitHub URLs
if (type === 'github') {
@@ -640,6 +878,13 @@ export async function fetchAndPrepareBookmarks(options = {}) {
return { bookmarks: [], count: 0 };
}
+ // Detect transcription tools once per run
+ const ytdlpPath = findYtDlp({ configPath: config.ytdlpPath });
+ const whisperPath = findWhisper({ configPath: config.whisperPath });
+ const toolPaths = { ytdlp: ytdlpPath, whisper: whisperPath };
+ if (ytdlpPath) console.log(` yt-dlp found: ${ytdlpPath}`);
+ if (whisperPath) console.log(` whisper found: ${whisperPath}`);
+
console.log(`Preparing ${toProcess.length} tweets...`);
const prepared = [];
@@ -738,6 +983,14 @@ export async function fetchAndPrepareBookmarks(options = {}) {
}
} else if (expanded.match(/\.(jpg|jpeg|png|gif|webp)$/i)) {
type = 'image';
+ } else if (
+ expanded.includes('podcasts.apple.com') ||
+ expanded.includes('spotify.com/episode') ||
+ expanded.includes('overcast.fm') ||
+ expanded.includes('pocketcasts.com') ||
+ expanded.includes('castro.fm')
+ ) {
+ type = 'podcast';
} else {
type = 'article';
}
@@ -773,6 +1026,30 @@ export async function fetchAndPrepareBookmarks(options = {}) {
}
}
+ // Fetch transcript for video and podcast links
+ if (type === 'video' || type === 'podcast') {
+ try {
+ const transcriptResult = await fetchTranscriptContent(expanded, config, toolPaths);
+ if (transcriptResult && transcriptResult.text) {
+ // Write full transcript to separate file to keep pending JSON small
+ const transcriptsDir = path.join(path.dirname(config.pendingFile), 'transcripts');
+ if (!fs.existsSync(transcriptsDir)) {
+ fs.mkdirSync(transcriptsDir, { recursive: true });
+ }
+ const transcriptFile = path.join(transcriptsDir, `${bookmark.id}.txt`);
+ fs.writeFileSync(transcriptFile, transcriptResult.text);
+ content = {
+ source: transcriptResult.source,
+ transcriptFile,
+ chars: transcriptResult.text.length
+ };
+ console.log(` Transcript: ${content.chars} chars (${content.source}) → ${transcriptFile}`);
+ }
+ } catch (error) {
+ console.log(` Transcript extraction failed: ${error.message}`);
+ }
+ }
+
links.push({
original: link,
expanded,
diff --git a/test/config.test.js b/test/config.test.js
index 8ba97fa..8f0e5ea 100644
--- a/test/config.test.js
+++ b/test/config.test.js
@@ -63,4 +63,37 @@ describe('loadConfig', () => {
// Default paths don't use ~, but the function should work
assert.ok(!config.archiveFile.includes('~'));
});
+
+ // Pins the default values of the transcription-related config keys.
+ // NOTE(review): relies on './nonexistent.json' not existing so that no user
+ // config is merged over the defaults — confirm loadConfig's missing-file behavior.
+ test('transcription config keys have correct defaults', () => {
+ const config = loadConfig('./nonexistent.json');
+ assert.strictEqual(config.ytdlpPath, null);
+ assert.strictEqual(config.whisperPath, null);
+ assert.strictEqual(config.whisperModel, 'small.en');
+ assert.ok(config.transcribeTimeouts);
+ assert.strictEqual(config.transcribeTimeouts.subtitle, 30000);
+ assert.strictEqual(config.transcribeTimeouts.audio, 300000);
+ assert.strictEqual(config.transcribeTimeouts.whisper, 600000);
+ });
+
+ // Checks that YTDLP_PATH, WHISPER_PATH and WHISPER_MODEL show up on the loaded config.
+ test('env var overrides work for transcription config', () => {
+ // Remember the current values so the finally block can put them back.
+ const origYtdlp = process.env.YTDLP_PATH;
+ const origWhisper = process.env.WHISPER_PATH;
+ const origModel = process.env.WHISPER_MODEL;
+ try {
+ process.env.YTDLP_PATH = '/custom/yt-dlp';
+ process.env.WHISPER_PATH = '/custom/whisper';
+ process.env.WHISPER_MODEL = 'tiny.en';
+ const config = loadConfig('./nonexistent.json');
+ assert.strictEqual(config.ytdlpPath, '/custom/yt-dlp');
+ assert.strictEqual(config.whisperPath, '/custom/whisper');
+ assert.strictEqual(config.whisperModel, 'tiny.en');
+ } finally {
+ // Restore even when an assertion fails. A variable that was originally unset
+ // is deleted rather than assigned, because assigning undefined to
+ // process.env stores the string "undefined".
+ if (origYtdlp === undefined) delete process.env.YTDLP_PATH;
+ else process.env.YTDLP_PATH = origYtdlp;
+ if (origWhisper === undefined) delete process.env.WHISPER_PATH;
+ else process.env.WHISPER_PATH = origWhisper;
+ if (origModel === undefined) delete process.env.WHISPER_MODEL;
+ else process.env.WHISPER_MODEL = origModel;
+ }
+ });
});
diff --git a/test/fixtures/sample-subtitles.json3 b/test/fixtures/sample-subtitles.json3
new file mode 100644
index 0000000..49e577d
--- /dev/null
+++ b/test/fixtures/sample-subtitles.json3
@@ -0,0 +1,25 @@
+{
+ "events": [
+ {
+ "segs": [{"utf8": "Hello and welcome to this video."}]
+ },
+ {
+ "segs": [{"utf8": "Today we'll be talking about"}]
+ },
+ {
+ "segs": [{"utf8": "Today we'll be talking about"}]
+ },
+ {
+ "segs": [{"utf8": " artificial intelligence."}]
+ },
+ {
+ "segs": [{"utf8": "Let's dive right in."}]
+ },
+ {
+ "tStartMs": 5000
+ },
+ {
+ "segs": [{"utf8": "First, let's define what AI means."}]
+ }
+ ]
+}
diff --git a/test/fixtures/sample-subtitles.vtt b/test/fixtures/sample-subtitles.vtt
new file mode 100644
index 0000000..b7ccc0f
--- /dev/null
+++ b/test/fixtures/sample-subtitles.vtt
@@ -0,0 +1,21 @@
+WEBVTT
+Kind: captions
+Language: en
+
+00:00:00.000 --> 00:00:02.500
+Hello and welcome to this video.
+
+00:00:02.500 --> 00:00:05.000
+Today we'll be talking about
+
+00:00:05.000 --> 00:00:07.500
+<00:00:05.500>artificial intelligence.
+
+00:00:07.500 --> 00:00:10.000
+Today we'll be talking about
+
+00:00:10.000 --> 00:00:12.500
+Let's dive right in.
+
+00:00:12.500 --> 00:00:15.000
+First, let&s define what AI means.
diff --git a/test/processor.test.js b/test/processor.test.js
index c8ffce8..130f160 100644
--- a/test/processor.test.js
+++ b/test/processor.test.js
@@ -4,7 +4,16 @@ import fs from 'fs';
import path from 'path';
import { execSync } from 'child_process';
import { fileURLToPath } from 'url';
-import { isPaywalled, stripQuerystring, fetchXArticleContent } from '../src/processor.js';
+import {
+ isPaywalled,
+ stripQuerystring,
+ fetchXArticleContent,
+ findYtDlp,
+ findWhisper,
+ parseJson3Transcript,
+ parseVttTranscript,
+ fetchTranscriptContent
+} from '../src/processor.js';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -579,6 +588,194 @@ describe('fetchBookmarks count truncation', () => {
});
});
+// Report whether yt-dlp is on PATH (availability check for integration tests)
+function hasYtDlp() {
+  try {
+    execSync('which yt-dlp', { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
+  } catch {
+    return false; // `which` exited non-zero or timed out, so execSync threw
+  }
+  return true;
+}
+
+function hasWhisper() { // true when `which whisper` succeeds, i.e. whisper is on PATH
+  try {
+    execSync('which whisper', { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
+  } catch {
+    return false; // `which` exited non-zero or timed out, so execSync threw
+  }
+  return true;
+}
+
+const YT_DLP_AVAILABLE = hasYtDlp(); // probed once, when this test module is loaded
+const WHISPER_AVAILABLE = hasWhisper(); // probed once, when this test module is loaded
+
+describe('findYtDlp', () => {
+  // Passed as `execSyncFn`; always throws, standing in for a lookup that finds nothing.
+  const lookupFails = () => { throw new Error('not found'); };
+
+  test('returns null when tool is not installed (mocked)', () => {
+    const located = findYtDlp({ execSyncFn: lookupFails, configPath: null });
+    assert.strictEqual(located, null);
+  });
+
+  test('returns configured path when it exists', () => {
+    // NOTE(review): '/usr/bin/env' presumably stands in for any file that exists on
+    // the host; confirm this holds on every platform the suite runs on.
+    const located = findYtDlp({ configPath: '/usr/bin/env' });
+    assert.strictEqual(located, '/usr/bin/env');
+  });
+
+  test('returns null for non-existent configured path', () => {
+    const located = findYtDlp({
+      execSyncFn: lookupFails,
+      configPath: '/nonexistent/path/yt-dlp'
+    });
+    assert.strictEqual(located, null);
+  });
+});
+
+describe('findWhisper', () => {
+  test('returns null when tool is not installed (mocked)', () => {
+    // The injected execSyncFn always throws, standing in for a failed lookup.
+    const lookupFails = () => { throw new Error('not found'); };
+    const located = findWhisper({ execSyncFn: lookupFails, configPath: null });
+    assert.strictEqual(located, null);
+  });
+
+  test('returns configured path when it exists', () => {
+    // NOTE(review): relies on '/usr/bin/env' existing on the host; confirm this is
+    // acceptable for every platform the suite runs on.
+    const configPath = '/usr/bin/env';
+    const located = findWhisper({ configPath });
+    assert.strictEqual(located, configPath);
+  });
+});
+
+describe('parseJson3Transcript', () => {
+  // Reads the json3 fixture and parses it; called inside each test so a fixture
+  // read error fails that test instead of surfacing while the suite is defined.
+  const parseFixture = () => parseJson3Transcript(
+    fs.readFileSync(path.join(__dirname, 'fixtures/sample-subtitles.json3'), 'utf8')
+  );
+
+  test('produces clean text from fixture', () => {
+    const transcript = parseFixture();
+    assert.ok(transcript.includes('Hello and welcome to this video.'));
+    assert.ok(transcript.includes('artificial intelligence.'));
+    assert.ok(transcript.includes("Let's dive right in."));
+    assert.ok(transcript.includes("First, let's define what AI means."));
+  });
+
+  test('deduplicates adjacent identical lines', () => {
+    const repeated = parseFixture().split('\n').filter(l => l === "Today we'll be talking about");
+    assert.strictEqual(repeated.length, 1, 'duplicate adjacent lines should be removed');
+  });
+
+  test('skips events without segs', () => {
+    assert.ok(!parseFixture().includes('undefined'));
+  });
+
+  test('returns empty string for malformed content', () => {
+    for (const malformed of ['not json', '', '{}']) {
+      assert.strictEqual(parseJson3Transcript(malformed), '');
+    }
+  });
+});
+
+describe('parseVttTranscript', () => {
+  // Parses the shared VTT fixture; invoked per test so read errors surface in the test.
+  const parseFixture = () => parseVttTranscript(
+    fs.readFileSync(path.join(__dirname, 'fixtures/sample-subtitles.vtt'), 'utf8')
+  );
+
+  test('produces clean text from fixture', () => {
+    const transcript = parseFixture();
+    assert.ok(transcript.includes('Hello and welcome to this video.'));
+    assert.ok(transcript.includes('artificial intelligence.'));
+    assert.ok(transcript.includes("Let's dive right in."));
+  });
+
+  test('strips VTT formatting tags', () => {
+    const transcript = parseFixture();
+    // Check for the literal <c> tag; an empty needle would match every string.
+    assert.ok(!transcript.includes('<c>'), 'color tags should be stripped');
+    assert.ok(!transcript.includes('<00:'), 'inline timestamps should be stripped');
+  });
+
+  test('decodes HTML entities', () => {
+    const transcript = parseFixture();
+    assert.ok(transcript.includes('&s define'), 'should decode &amp; to &');
+    assert.ok(transcript.includes("Let's"), 'should decode &#39; to apostrophe');
+  });
+
+  test('excludes WEBVTT header and timestamp lines', () => {
+    const transcript = parseFixture();
+    assert.ok(!transcript.includes('WEBVTT'), 'header should be excluded');
+    assert.ok(!transcript.includes('-->'), 'timestamps should be excluded');
+    assert.ok(!transcript.includes('Kind:'), 'metadata should be excluded');
+  });
+
+  test('deduplicates adjacent lines', () => {
+    const lines = parseFixture().split('\n');
+    for (let i = 1; i < lines.length; i++) {
+      assert.notStrictEqual(lines[i], lines[i - 1], `adjacent lines should not be identical: "${lines[i]}"`);
+    }
+  });
+
+  test('returns empty string for null/empty input', () => {
+    assert.strictEqual(parseVttTranscript(null), '');
+    assert.strictEqual(parseVttTranscript(''), '');
+    assert.strictEqual(parseVttTranscript(undefined), '');
+  });
+});
+
+describe('fetchTranscriptContent', () => {
+  // Both cases pass no usable tool paths and expect a null result.
+  const VIDEO_URL = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ';
+
+  test('returns null with log when no tools available', async () => {
+    const settings = { transcribeTimeouts: {} };
+    const toolPaths = { ytdlp: null, whisper: null };
+    // NOTE(review): only the return value is asserted; the log mentioned in the
+    // test title is not checked here.
+    const transcript = await fetchTranscriptContent(VIDEO_URL, settings, toolPaths);
+    assert.strictEqual(transcript, null);
+  });
+
+  test('returns null when toolPaths is undefined', async () => {
+    // The third argument is passed as an explicit undefined on purpose.
+    const transcript = await fetchTranscriptContent(VIDEO_URL, {}, undefined);
+    assert.strictEqual(transcript, null);
+  });
+});
+
+describe('podcast URL classification', () => {
+  // NOTE(review): these cases only match substrings against literal URLs; no function
+  // from src/processor.js is called, so they do not exercise the real classifier.
+  test('Apple Podcasts URL is detected as podcast type', () => {
+    assert.ok('https://podcasts.apple.com/us/podcast/episode-name/id123456'.includes('podcasts.apple.com'));
+  });
+
+  test('Spotify episode URL is detected as podcast type', () => {
+    assert.ok('https://open.spotify.com/episode/abc123'.includes('spotify.com/episode'));
+  });
+
+  test('Overcast URL is detected as podcast type', () => {
+    assert.ok('https://overcast.fm/+abc123'.includes('overcast.fm'));
+  });
+
+  test('YouTube URL is NOT classified as podcast', () => {
+    const url = 'https://www.youtube.com/watch?v=abc123';
+    const podcastHosts = [
+      'podcasts.apple.com', 'spotify.com/episode', 'overcast.fm', 'pocketcasts.com', 'castro.fm'
+    ];
+    for (const host of podcastHosts) {
+      assert.ok(!url.includes(host));
+    }
+  });
+});
+
// Integration tests - only run when bird CLI has valid credentials
describe('X article integration tests (requires bird credentials)', { skip: !BIRD_AVAILABLE }, () => {
// These tests require actual Twitter/X API access via bird CLI