alexknowshtml · StartupBros · Mar 26, 2026
diff --git a/.claude/commands/process-bookmarks.md b/.claude/commands/process-bookmarks.md
@@ -59,11 +59,11 @@ TodoWrite({ todos: [
 **CRITICAL for parallel processing:** Spawn ALL subagents in ONE message, each writing to a batch file:
 ```javascript
 // Send ONE message with multiple Task calls - they run in parallel
-// Use model="haiku" for cost-efficient parallel processing (~50% cost savings)
+// Use model="sonnet" for high-quality summarization and insight extraction
 // Each subagent writes to .state/batch-N.md, NOT to bookmarks.md!
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 0: write to .state/batch-0.md: {json for bookmarks 0-4}")
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 1: write to .state/batch-1.md: {json for bookmarks 5-9}")
-Task(subagent_type="general-purpose", model="haiku", prompt="Process batch 2: write to .state/batch-2.md: {json for bookmarks 10-14}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 0: write to .state/batch-0.md: {json for bookmarks 0-4}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 1: write to .state/batch-1.md: {json for bookmarks 5-9}")
+Task(subagent_type="general-purpose", model="sonnet", prompt="Process batch 2: write to .state/batch-2.md: {json for bookmarks 10-14}")
 // ... all batches in the SAME message
 ```
 
@@ -107,8 +107,8 @@ Each bookmark includes:
 - `id`, `author`, `authorName`, `text`, `tweetUrl`, `date`
 - `tags[]` - folder tags from bookmark folders (e.g., `["ai-tools"]`)
 - `links[]` - each with `original`, `expanded`, `type`, and `content`
-  - `type`: "github", "article", "video", "tweet", "media", "image"
-  - `content`: extracted text, headline, author (for articles/github)
+  - `type`: "github", "article", "video", "podcast", "tweet", "media", "image"
+  - `content`: extracted text, headline, author (for articles/github); for video/podcast: `{ source: 'yt-dlp-captions'|'whisper', transcriptFile: '.state/transcripts/{id}.txt', chars: N }`
 - `isReply`, `replyContext` - parent tweet info if this is a reply
 - `isQuote`, `quoteContext` - quoted tweet info if this is a quote tweet
 
@@ -160,13 +160,15 @@ For each bookmark (or batch):
 
 #### a. Determine the best title/summary
 
-Don't use generic titles like "Article" or "Tweet". Based on the content:
+Don't use generic titles like "Article" or "Tweet". The title appears after `## @author - ` and must be descriptive enough to scan in a list. Based on the content:
 - GitHub repos: Use the repo name and brief description
 - Articles: Use the article headline or key insight
-- Videos: Note for transcript, use tweet context
-- Quote tweets: Capture the key insight being highlighted
+- Videos/Podcasts with transcript: Use the key insight or topic from the transcript content
+- Videos/Podcasts without transcript: Note for transcript, use tweet context
+- Quote tweets: Capture the key insight being highlighted, NOT the reaction (e.g., "JC_builds: Local Calorie Estimation Model Beating GPT-4o" not "Retweet: Are you guys starting to catch on?")
 - Reply threads: Include parent context in the summary
 - Plain tweets: Use the key point being made
+- Image/video-only with no context: Use `@{author} - [Media] {best guess from author's bio/context}` — never just "Video post" or the raw t.co URL
 
 #### b. Categorize using the categories config
 
@@ -175,7 +177,7 @@ Match each bookmark's links against category patterns (check `match` arrays). Us
 **For each action type:**
 - `file`: Create a separate file in the category's folder using its template
 - `capture`: Just add to bookmarks.md (no separate file)
-- `transcribe`: Add to bookmarks.md with a "Needs transcript" flag, optionally create placeholder in folder
+- `transcribe`: If the link's `content` has a `transcriptFile` path, read the first 20,000 characters of that file (enough for summarization — don't read the whole file for long transcripts). Create a full knowledge file with structured summary, key takeaways, and insights, with `status: transcribed` and `transcript_source` in frontmatter. If `content` is null or has no `transcriptFile`, fall back to placeholder behavior with `status: needs_transcript`
 
 **Special handling:**
 - Quote tweets: Include quoted tweet context in entry
@@ -185,6 +187,8 @@ Match each bookmark's links against category patterns (check `match` arrays). Us
 
 **Use the Edit tool** to insert entries into the `archiveFile` (expand `~` to home directory). NEVER use Write - it will destroy existing entries.
 
+**DEDUPLICATION CHECK (CRITICAL):** Before inserting ANY entry, search bookmarks.md for the tweet URL (`x.com/{author}/status/{id}`). If it already exists, SKIP it — do not create a duplicate. Log: `Skipping duplicate: @{author} {id}`.
+
 **CRITICAL ordering rules for bookmarks.md:**
 
 The file must be in **descending chronological order** (newest dates at TOP, oldest at BOTTOM).
@@ -216,9 +220,27 @@ The file must be in **descending chronological order** (newest dates at TOP, old
 - **Link:** {expanded_url}
 - **Tags:** [[tag1]] [[tag2]] (if bookmark has tags from folders)
 - **Filed:** [{filename}](./knowledge/tools/{slug}.md) (if filed)
-- **What:** {1-2 sentence description of what this actually is}
+- **What:** {1-2 sentence synthesis — see What Field Rules below}
 ```
 
+### What Field Rules (CRITICAL — apply to every entry)
+
+The **What** field is the most important metadata. It must be a **synthesis**, not a label or echo.
+
+**Minimum bar:** 80+ characters, written as a complete sentence describing what the bookmark contains and why it matters. Descriptions under 80 characters are almost always lazy echoes or category labels — rewrite them until they actually say something.
+
+**NEVER write:**
+- Category labels: "Tool or resource share", "Commentary/perspective", "Claude Code insights/comparison"
+- Echo of the tweet text: If the tweet says "karpathy really is the fucking goat", the What must explain *what about Karpathy* — don't parrot the reaction
+- Generic placeholders: "Video content post", "Social media image bookmark", "Chart showing interesting data trend"
+- Raw URLs: Never put a t.co or any URL as the What
+
+**For quote tweets / replies:** The What MUST synthesize BOTH the author's commentary AND the quoted/parent content. The quoted content often contains the actual substance — a reaction tweet like "Are you guys starting to catch on?" is worthless without explaining what the quoted tweet actually describes.
+
+**For thin-content tweets (image-only, short with no link):** Write what you CAN infer from the author, text, and any visible context, prefixed with `THIN:` so downstream tools can flag these for manual review. Example: `THIN: @calebporzio reacting to an unspecified Chrome feature — tweet is image/video only, no text context available.`
+
+**For failed link expansion (raw t.co links):** Write `LINK_FAILED: Could not expand link from @{author} — original t.co URL did not resolve.`
+
 **Tags format:** Use wiki-link style `[[TagName]]` for each tag. Only include the **Tags:** line if the bookmark has tags in its `tags` array (from folder configuration). Example: `- **Tags:** [[AI]] [[Coding]]`
 
 **For quote tweets, include the quoted content:**
@@ -261,6 +283,12 @@ const remaining = pending.bookmarks.filter(b => !processedIds.has(b.id));
 pending.bookmarks = remaining;
 pending.count = remaining.length;
 fs.writeFileSync(pendingPath, JSON.stringify(pending, null, 2));
+
+// Clean up transcript files for processed bookmarks
+for (const id of processedIds) {
+  const transcriptFile = path.join(path.dirname(pendingPath), 'transcripts', `${id}.txt`);
+  if (fs.existsSync(transcriptFile)) fs.unlinkSync(transcriptFile);
+}
 ```
 
 ### 4. Commit and Push Changes
@@ -354,6 +382,42 @@ via: "Twitter bookmark from @{author}"
 
 ### Podcast Entry (`./knowledge/podcasts/{slug}.md`)
 
+**When transcript content is available** (`content.source` is `'yt-dlp-captions'` or `'whisper'`):
+
+```yaml
+---
+title: "{episode_title}"
+type: podcast
+date_added: {YYYY-MM-DD}
+source: "{podcast_url}"
+show: "{show_name}"
+tags: [{relevant_tags}, {folder_tags}]
+via: "Twitter bookmark from @{author}"
+status: transcribed
+transcript_source: "{yt-dlp-captions or whisper}"
+---
+
+{Summary of the episode's key points synthesized from the transcript}
+
+## Episode Info
+
+- **Show:** {show_name}
+- **Episode:** {episode_title}
+- **Why bookmarked:** {context from tweet}
+
+## Key Takeaways
+
+- Point 1 (from transcript)
+- Point 2 (from transcript)
+
+## Links
+
+- [Episode]({podcast_url})
+- [Original Tweet]({tweet_url})
+```
+
+**When no transcript available** (`content` is null):
+
 ```yaml
 ---
 title: "{episode_title}"
@@ -386,6 +450,42 @@ status: needs_transcript
 
 ### Video Entry (`./knowledge/videos/{slug}.md`)
 
+**When transcript content is available** (`content.source` is `'yt-dlp-captions'` or `'whisper'`):
+
+```yaml
+---
+title: "{video_title}"
+type: video
+date_added: {YYYY-MM-DD}
+source: "{video_url}"
+channel: "{channel_name}"
+tags: [{relevant_tags}, {folder_tags}]
+via: "Twitter bookmark from @{author}"
+status: transcribed
+transcript_source: "{yt-dlp-captions or whisper}"
+---
+
+{Summary of the video's key points synthesized from the transcript}
+
+## Video Info
+
+- **Channel:** {channel_name}
+- **Title:** {video_title}
+- **Why bookmarked:** {context from tweet}
+
+## Key Takeaways
+
+- Point 1 (from transcript)
+- Point 2 (from transcript)
+
+## Links
+
+- [Video]({video_url})
+- [Original Tweet]({tweet_url})
+```
+
+**When no transcript available** (`content` is null):
+
 ```yaml
 ---
 title: "{video_title}"
@@ -427,10 +527,10 @@ status: needs_transcript
 Spawn multiple Task subagents in ONE message. Each writes to a separate temp file:
 
 ```
-Task 1: model="haiku", "Process batch 0" → writes to .state/batch-0.md
-Task 2: model="haiku", "Process batch 1" → writes to .state/batch-1.md
-Task 3: model="haiku", "Process batch 2" → writes to .state/batch-2.md
-Task 4: model="haiku", "Process batch 3" → writes to .state/batch-3.md
+Task 1: model="sonnet", "Process batch 0" → writes to .state/batch-0.md
+Task 2: model="sonnet", "Process batch 1" → writes to .state/batch-1.md
+Task 3: model="sonnet", "Process batch 2" → writes to .state/batch-2.md
+Task 4: model="sonnet", "Process batch 3" → writes to .state/batch-3.md
 ```
 
 **Subagent prompt template:**
@@ -448,9 +548,19 @@ DATE: {bookmark.date}
 
 - **Tweet:** {url}
 - **Tags:** [[tag1]] [[tag2]] (if tags exist)
-- **What:** {description}
-
-Also create knowledge files (./knowledge/tools/*.md, ./knowledge/articles/*.md) as needed.
+- **What:** {1-2 sentence synthesis, 80+ chars minimum}
+
+**What field rules (MUST follow):**
+- Write a SYNTHESIS, not a category label or echo of the tweet
+- NEVER write generic labels like "Tool or resource share" or "Commentary/perspective"
+- NEVER just echo the tweet text as the What — explain what the content IS and why it matters
+- For quote tweets: synthesize BOTH the commentary AND the quoted content (the quote often has the substance)
+- For image/video-only tweets with no text context: prefix with "THIN:" and describe what you can infer
+- For failed link expansion (raw t.co URLs): write "LINK_FAILED: Could not expand link from @{author}"
+- Minimum 80 characters. Under 80 means you're echoing or categorizing, not synthesizing. Rewrite.
+
+For video/podcast bookmarks with a transcriptFile in content, read the FIRST 20,000 characters of that file for summarization.
+Also create knowledge files (./knowledge/tools/*.md, ./knowledge/articles/*.md, ./knowledge/videos/*.md, ./knowledge/podcasts/*.md) as needed.
 DO NOT touch bookmarks.md - only write to .state/batch-{N}.md
 ```
 
@@ -468,6 +578,7 @@ After ALL subagents complete:
 **Merge logic for bookmarks.md:**
 - File is descending order (newest dates at top)
 - For each entry from batch files (processed in order):
+  - **DEDUP CHECK FIRST:** Search bookmarks.md for the tweet URL. If already present, skip.
   - Find or create the date section at correct position
   - Insert entry at TOP of that date section
 - Since batches are oldest-first, entries end up in correct order

diff --git a/README.md b/README.md
@@ -154,10 +154,11 @@ Categories define how different bookmark types are handled. Smaug comes with sen
 | **github** | github.com | file | `./knowledge/tools/` |
 | **article** | medium.com, substack.com, dev.to, blogs | file | `./knowledge/articles/` |
 | **x-article** | x.com/i/article/* | file | `./knowledge/articles/` |
+| **podcast** | podcasts.apple.com, spotify.com/episode, overcast.fm | transcribe | `./knowledge/podcasts/` |
+| **youtube** | youtube.com, youtu.be | transcribe | `./knowledge/videos/` |
+| **video** | vimeo.com, loom.com | transcribe | `./knowledge/videos/` |
 | **tweet** | (fallback) | capture | bookmarks.md only |
 
-🔜 _Note: Transcription is flagged but not yet automated. PRs welcome!_
-
 ### X/Twitter Long-Form Articles
 
 X articles (`x.com/i/article/*`) are Twitter's native long-form content format. Smaug extracts the full article text using bird CLI:
@@ -177,11 +178,56 @@ Example X article bookmark:
 - **What:** Deep dive into patterns from scaling CrewAI to billions of agent executions.
 ```
 
+### Video & Podcast Transcription
+
+Smaug automatically extracts transcripts from YouTube videos, podcasts, and other video content using a tiered strategy:
+
+1. **Captions (fast)**: yt-dlp downloads existing subtitles — no audio processing needed
+2. **Whisper (fallback)**: If no captions exist, yt-dlp extracts audio and Whisper transcribes it locally
+3. **Placeholder**: If neither tool is installed, bookmarks are flagged with `status: needs_transcript`
+
+Both tools are **optional** — install what you need:
+
+```bash
+# For caption extraction (recommended — handles most YouTube videos)
+pip install yt-dlp
+# or: brew install yt-dlp
+
+# For audio transcription when captions aren't available
+pip install openai-whisper
+
+# Required for Whisper (Tier 2) — ffmpeg handles audio extraction and decoding
+# Not needed if you only use caption extraction (Tier 1)
+sudo apt-get install ffmpeg   # Debian/Ubuntu
+# or: brew install ffmpeg     # macOS
+```
+
+**Supported platforms:** YouTube (excellent), Vimeo (good), SoundCloud (audio only), direct video URLs. Spotify and Apple Podcasts are not supported by yt-dlp — these produce placeholder files.
+
+**Configuration** (all optional, in `smaug.config.json`):
+
+```json
+{
+  "ytdlpPath": "/custom/path/to/yt-dlp",
+  "whisperPath": "/custom/path/to/whisper",
+  "whisperModel": "small.en",
+  "transcribeTimeouts": {
+    "subtitle": 30000,
+    "audio": 300000,
+    "whisper": 600000
+  }
+}
+```
+
+**How transcripts are stored:** Full transcripts are written to `.state/transcripts/{tweet_id}.txt` — they never bloat the pending JSON, even for 6-hour podcasts. During processing, Claude reads only the first ~20K characters needed for summarization.
+
+Environment variables: `YTDLP_PATH`, `WHISPER_PATH`, `WHISPER_MODEL`.
+
 ### Actions
 
 - **file**: Create a separate markdown file with rich metadata
 - **capture**: Add to bookmarks.md only (no separate file)
-- **transcribe**: Flag for future transcription *(auto-transcription coming soon! PRs welcome)*
+- **transcribe**: Extract transcript via yt-dlp captions or Whisper, then create a rich knowledge file. Falls back to a placeholder if tools are not installed
 
 ### Custom Categories
 

diff --git a/src/config.js b/src/config.js
@@ -34,6 +34,23 @@ const DEFAULT_CONFIG = {
   // Path to bird CLI (if not in PATH)
   birdPath: null,
 
+  // Path to yt-dlp CLI (if not in PATH)
+  ytdlpPath: null,
+
+  // Path to whisper CLI (if not in PATH)
+  whisperPath: null,
+
+  // Whisper model to use (default: small.en)
+  whisperModel: 'small.en',
+
+  // Transcription timeouts (ms)
+  transcribeTimeouts: {
+    subtitle: 30000,    // 30s for subtitle download
+    audio: 300000,      // 5min for audio extraction
+    whisper: 600000     // 10min for whisper transcription
+  },
+
+
   // Twitter credentials (can also use AUTH_TOKEN and CT0 env vars)
   twitter: {
     authToken: null,
@@ -221,6 +238,15 @@ export function loadConfig(configPath) {
   if (process.env.BIRD_PATH) {
     config.birdPath = process.env.BIRD_PATH;
   }
+  if (process.env.YTDLP_PATH) {
+    config.ytdlpPath = process.env.YTDLP_PATH;
+  }
+  if (process.env.WHISPER_PATH) {
+    config.whisperPath = process.env.WHISPER_PATH;
+  }
+  if (process.env.WHISPER_MODEL) {
+    config.whisperModel = process.env.WHISPER_MODEL;
+  }
   if (process.env.SOURCE) {
     config.source = process.env.SOURCE;
   }
@@ -268,6 +294,8 @@ export function loadConfig(configPath) {
   config.pendingFile = expandTilde(config.pendingFile);
   config.stateFile = expandTilde(config.stateFile);
   config.birdPath = expandTilde(config.birdPath);
+  config.ytdlpPath = expandTilde(config.ytdlpPath);
+  config.whisperPath = expandTilde(config.whisperPath);
   config.projectRoot = expandTilde(config.projectRoot);
 
   // Expand ~ in category folders

diff --git a/src/index.js b/src/index.js
@@ -13,6 +13,11 @@ export {
   fetchGitHubContent,
   fetchArticleContent,
   fetchXArticleContent,
+  fetchTranscriptContent,
+  parseJson3Transcript,
+  parseVttTranscript,
+  findYtDlp,
+  findWhisper,
   isPaywalled,
   loadState,
   saveState