diff --git a/.env.example b/.env.example
new file mode 100644
index 000000000..807972bbb
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+# Copy this file to .env and add your actual API key
+GEMINI_API_KEY=your-gemini-api-key-here
+GOOGLE_API_KEY=your-google-api-key-here
+YOUTUBE_API_KEY=your-youtube-api-key-here
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..e00f8ee7f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,186 @@
+# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
+
+# Logs
+
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Caches
+
+.cache
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+
+lib-cov
+
+# Coverage directory used by tools like istanbul
+
+coverage
+*.lcov
+
+# nyc test coverage
+
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+
+bower_components
+
+# node-waf configuration
+
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+
+build/Release
+
+# Dependency directories
+
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+
+web_modules/
+
+# TypeScript cache
+
+*.tsbuildinfo
+
+# Optional npm cache directory
+
+.npm
+
+# Optional eslint cache
+
+.eslintcache
+
+# Optional stylelint cache
+
+.stylelintcache
+
+# Microbundle cache
+
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+
+.node_repl_history
+
+# Output of 'npm pack'
+
+*.tgz
+
+# Yarn Integrity file
+
+.yarn-integrity
+
+# dotenv environment variable files
+
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# parcel-bundler cache (https://parceljs.org/)
+
+.parcel-cache
+
+# Next.js build output
+
+.next
+out
+
+# Nuxt.js build / generate output
+
+.nuxt
+dist
+
+# Gatsby files
+
+# Comment in the public line in if your project uses Gatsby and not Next.js
+
+# https://nextjs.org/blog/next-9-1#public-directory-support
+
+# public
+
+# vuepress build output
+
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+
+.temp
+
+# Docusaurus cache and generated files
+
+.docusaurus
+
+# Serverless directories
+
+.serverless/
+
+# FuseBox cache
+
+.fusebox/
+
+# DynamoDB Local files
+
+.dynamodb/
+
+# TernJS port file
+
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+
+.vscode-test
+
+# yarn v2
+
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+
+# IntelliJ based IDEs
+.idea
+
+# Finder (MacOS) folder config
+.DS_Store
+
+.vscode/*
+.env
+
+src/__pycache__/*
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+venv/
+.venv/
diff --git a/README.md b/README.md
index 1bc06dbb8..0d1b4e8e4 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,98 @@
-# Agentic AI App Hackathon Template
+# YouTube Podcast Summarizer - Agentic AI Hackathon Submission
-Welcome! This repository is your starting point for the **Agentic AI App Hackathon**. It includes:
+Start the backend:
-- A consistent folder structure
-- An environment spec (`environment.yml` or `Dockerfile`)
-- Documentation placeholders to explain your design and demo
+```bash
+cd src && python api.py
+```
+
+An intelligent agent that summarizes YouTube podcast transcripts using Google Gemini AI, demonstrating a complete agentic AI architecture with planning, execution, and memory components.
+
+## π― Hackathon Requirements
+
+β **Google Gemini API Integration**: Core AI engine for intelligent summarization
+β **Agentic Architecture**: Modular design with Planner, Executor, and Memory
+β **ReAct Pattern**: Reasoning + Acting workflow demonstrated
+β **Tool Integration**: YouTube Transcript API + Gemini API
+β **Full Observability**: Complete memory logging of all agent decisions
+β **Complete Documentation**: Architecture, explanation, and demo included
## π Submission Checklist
-- [ ] All code in `src/` runs without errors
-- [ ] `ARCHITECTURE.md` contains a clear diagram sketch and explanation
-- [ ] `EXPLANATION.md` covers planning, tool use, memory, and limitations
-- [ ] `DEMO.md` links to a 3β5 min video with timestamped highlights
+- [x] All code in `src/` runs without errors
+- [x] `ARCHITECTURE.md` contains a clear diagram sketch and explanation
+- [x] `EXPLANATION.md` covers planning, tool use, memory, and limitations
+- [ ] `DEMO.md` links to a 3β5 min video with timestamped highlights
+
+## ποΈ Architecture
+This agent follows a clean agentic AI pattern with three core modules:
+
+```
+User Input β PLANNER β EXECUTOR β MEMORY
+ β β β
+ Sub-tasks Tool Calls Logging
+ β
+ [YouTube API]
+ [Gemini API]
+```
+
+### Core Components
+
+1. **Planner** (`planner.py`) - Breaks down user goal into executable sub-tasks
+2. **Executor** (`executor.py`) - Executes tasks using appropriate tools
+3. **Memory** (`memory.py`) - Logs all agent activities and decisions
## π Getting Started
-1. **Clone / Fork** this template. Very Important. Fork Name MUST be the same name as the teamn name
+### Prerequisites
+- Python 3.8+
+- Google Gemini API key ([Get one free](https://makersuite.google.com/app/apikey))
+
+
+### Installation
+
+1. **Install dependencies**
+ ```powershell
+ pip install -r requirements.txt
+ ```
+
+2. **Set up your Gemini API key**
+ ```powershell
+ $env:GEMINI_API_KEY='your-gemini-api-key-here'
+ ```
+
+3. **Run the application**
+ ```powershell
+ cd src
+ streamlit run app.py
+ ```
+
+## π How It Works
+
+### 1. Planning Phase (ReAct: Reasoning)
+The **Planner** creates an execution plan with sub-tasks
+
+### 2. Execution Phase (ReAct: Acting)
+The **Executor** runs each task using external tools
+
+### 3. Memory & Observability
+The **Memory** component logs all agent decisions
+
+## π Project Structure
+
+```
+src/
+βββ planner.py # Planning module
+βββ executor.py # Execution module
+βββ memory.py # Memory module
+βββ youtube_summarizer.py # Main agent
+βββ app.py # Streamlit UI
+ARCHITECTURE.md # System design
+EXPLANATION.md # Technical explanation
+DEMO.md # Video demo
+```
## π Folder Layout
@@ -27,16 +102,16 @@ Welcome! This repository is your starting point for the **Agentic AI App Hackath
## π Judging Criteria
-- **Technical Excellence **
+- **Technical Excellence **
This criterion evaluates the robustness, functionality, and overall quality of the technical implementation. Judges will assess the code's efficiency, the absence of critical bugs, and the successful execution of the project's core features.
-- **Solution Architecture & Documentation **
+- **Solution Architecture & Documentation **
This focuses on the clarity, maintainability, and thoughtful design of the project's architecture. This includes assessing the organization and readability of the codebase, as well as the comprehensiveness and conciseness of documentation (e.g., GitHub README, inline comments) that enables others to understand and potentially reproduce or extend the solution.
-- **Innovative Gemini Integration **
+- **Innovative Gemini Integration **
This criterion specifically assesses how effectively and creatively the Google Gemini API has been incorporated into the solution. Judges will look for novel applications, efficient use of Gemini's capabilities, and the impact it has on the project's functionality or user experience. You are welcome to use additional Google products.
-- **Societal Impact & Novelty **
+- **Societal Impact & Novelty **
This evaluates the project's potential to address a meaningful problem, contribute positively to society, or offer a genuinely innovative and unique solution. Judges will consider the originality of the idea, its potential realβworld applicability, and its ability to solve a challenge in a new or impactful way.
diff --git a/YOUTUBE_API_SETUP.md b/YOUTUBE_API_SETUP.md
new file mode 100644
index 000000000..23348470d
--- /dev/null
+++ b/YOUTUBE_API_SETUP.md
@@ -0,0 +1,180 @@
+# π YouTube API Setup Guide
+
+Get **real trending videos** from YouTube instead of sample data!
+
+---
+
+## π Quick Start (3 Steps)
+
+### 1οΈβ£ Get YouTube API Key
+
+**Go to:** https://console.cloud.google.com/
+
+1. **Create a Project**
+ - Click "Select a Project" β "New Project"
+ - Name: "PodVibe" (or your choice)
+ - Click "Create"
+
+2. **Enable YouTube Data API v3**
+ - Go to: https://console.cloud.google.com/apis/library
+ - Search: "YouTube Data API v3"
+ - Click it β Click "Enable"
+
+3. **Create API Key**
+ - Go to: https://console.cloud.google.com/apis/credentials
+ - Click "+ CREATE CREDENTIALS"
+ - Select "API Key"
+ - **Copy the key!** (looks like: `AIza...`)
+
+4. **Optional: Restrict the Key** (recommended for security)
+ - Click "Edit API Key"
+ - Under "API restrictions" β "Restrict key"
+ - Check only: "YouTube Data API v3"
+ - Save
+
+---
+
+### 2οΈβ£ Configure Your Project
+
+**Option A: Use the setup script (easiest)**
+
+```bash
+cd /path/to/PodVibe.fm
+./setup_youtube_api.sh
+```
+
+**Option B: Manual setup**
+
+Create `src/.env` file:
+
+```bash
+cd src
+nano .env
+```
+
+Add your keys:
+
+```env
+# Required: Google Gemini API Key
+GEMINI_API_KEY=your_gemini_key_here
+
+# Optional: YouTube Data API Key (for real trending videos)
+YOUTUBE_API_KEY=AIzaSy...your_youtube_key_here
+```
+
+---
+
+### 3οΈβ£ Start the Backend
+
+```bash
+cd src
+python3 api.py
+```
+
+You should see:
+```
+π Starting PodVibe.fm API...
+β GEMINI_API_KEY found
+β Using YouTube Data API for real trending videos
+* Running on http://0.0.0.0:8000
+```
+
+**Refresh your browser** β Real trending videos! π
+
+---
+
+## π° API Quotas (Free Tier)
+
+YouTube Data API v3 includes **10,000 quota units/day** for free:
+
+| Action | Cost | Your Usage |
+|--------|------|------------|
+| Search request | 100 units | 400 units/page load |
+| **Daily capacity** | - | **~25 page loads/day** |
+
+Perfect for development and demos!
+
+---
+
+## β What You Get
+
+### With YouTube API Key:
+- β **Real trending videos** (updated live)
+- β Actual view counts and popularity
+- β Recent uploads (last 30 days)
+- β Accurate metadata and descriptions
+- β High-quality thumbnails
+
+### Without YouTube API Key:
+- β οΈ Sample/demo data only
+- Static videos (not updated)
+- Still works for testing!
+
+---
+
+## π§ Troubleshooting
+
+### "API key not valid" error
+
+**Check:**
+1. YouTube Data API v3 is enabled in your project
+2. No typos in your API key
+3. API key restrictions allow YouTube Data API v3
+
+### "Quota exceeded" error
+
+**You've used your daily quota (10,000 units).**
+
+Solutions:
+- Wait until tomorrow (quota resets at midnight Pacific Time)
+- Request quota increase (if you need more)
+- Backend automatically falls back to sample data
+
+### Backend shows "using sample data"
+
+**The API key isn't being detected.**
+
+Check:
+```bash
+cd src
+cat .env | grep YOUTUBE_API_KEY
+```
+
+Should show: `YOUTUBE_API_KEY=AIza...`
+
+---
+
+## π― Testing Your Setup
+
+```bash
+# Test the trending endpoint
+curl http://localhost:8000/api/trending
+
+# Should return JSON with:
+# - 4 categories
+# - 3 videos each
+# - Real YouTube video IDs
+```
+
+---
+
+## π Next Steps
+
+1. β YouTube API configured
+2. Start backend: `python3 api.py`
+3. Open: http://localhost:3000
+4. Browse real trending videos!
+5. Click "AI Summarize" to get insights
+
+---
+
+## π Additional Resources
+
+- [YouTube Data API Docs](https://developers.google.com/youtube/v3)
+- [Google Cloud Console](https://console.cloud.google.com/)
+- [API Key Best Practices](https://cloud.google.com/docs/authentication/api-keys)
+
+---
+
+**Questions?** Check the main README.md or create an issue on GitHub!
+
diff --git a/__pycache__/podcast_api_example.cpython-312.pyc b/__pycache__/podcast_api_example.cpython-312.pyc
new file mode 100644
index 000000000..f7a27c783
Binary files /dev/null and b/__pycache__/podcast_api_example.cpython-312.pyc differ
diff --git a/bun.lockb b/bun.lockb
new file mode 100755
index 000000000..ef6f9ee52
Binary files /dev/null and b/bun.lockb differ
diff --git a/frontend/README.md b/frontend/README.md
new file mode 100644
index 000000000..76959c7fc
--- /dev/null
+++ b/frontend/README.md
@@ -0,0 +1,154 @@
+# YouTube Podcast Summarizer - React UI
+
+Modern, responsive React frontend for the YouTube Podcast Summarizer with Agentic AI.
+
+## Features
+
+- π¨ Beautiful gradient UI with smooth animations
+- π± Fully responsive design (mobile, tablet, desktop)
+- β‘ Real-time processing status updates
+- π Statistics and metrics display
+- πΎ Download summaries as JSON
+- π§ Visualizes Agentic AI architecture (Planner, Executor, Memory)
+
+## Tech Stack
+
+- **React 18** - UI framework
+- **Vite** - Build tool and dev server
+- **Axios** - HTTP client for API calls
+- **Lucide React** - Beautiful icon library
+- **CSS3** - Custom styling with gradients and animations
+
+## Installation
+
+1. Navigate to the frontend directory:
+```bash
+cd frontend
+```
+
+2. Install dependencies:
+```bash
+npm install
+```
+
+## Running the Application
+
+### Step 1: Start the Python Backend API
+
+In the `src` directory:
+
+```bash
+# Install Flask dependencies first
+pip install flask flask-cors
+
+# Start the API server
+python api.py
+```
+
+The API will run on `http://localhost:8000`
+
+### Step 2: Start the React Frontend
+
+In the `frontend` directory:
+
+```bash
+npm run dev
+```
+
+The React app will run on `http://localhost:3000`
+
+### Step 3: Use the Application
+
+1. Open your browser to `http://localhost:3000`
+2. Enter a YouTube podcast URL
+3. Click "Summarize"
+4. Watch the AI process your request in real-time
+5. View the comprehensive summary and download as JSON
+
+## API Endpoints
+
+The React app connects to these backend endpoints:
+
+- `POST /api/summarize` - Process YouTube video and generate summary
+- `GET /api/health` - Check API health status
+- `GET /api/models` - List available Gemini models
+
+## Project Structure
+
+```
+frontend/
+βββ src/
+β βββ App.jsx # Main React component
+β βββ App.css # Styling
+β βββ main.jsx # React entry point
+β βββ index.css # Global styles
+βββ index.html # HTML template
+βββ package.json # Dependencies
+βββ vite.config.js # Vite configuration
+```
+
+## Build for Production
+
+```bash
+npm run build
+```
+
+The optimized production build will be in the `dist/` directory.
+
+## Clip splicer (FFmpeg helper)
+
+This repo includes a bun-based clip splicer CLI at `../src/utils/clip-splicer.ts`.
+
+From the `frontend/` directory:
+
+```bash
+# requires bun installed
+npm run clip -- --help
+```
+
+## Environment Variables
+
+Make sure your `.env` file in the parent directory contains:
+
+```
+GEMINI_API_KEY=your_api_key_here
+```
+
+## Troubleshooting
+
+### CORS Issues
+If you encounter CORS errors, make sure:
+1. The Flask API is running with `flask-cors` installed
+2. The proxy configuration in `vite.config.js` is correct
+
+### API Connection Failed
+Ensure:
+1. The Flask API is running on port 8000
+2. Your `GEMINI_API_KEY` is set in the `.env` file
+3. All Python dependencies are installed
+
+### Port Already in Use
+If port 3000 or 8000 is in use:
+```bash
+# Change the port in vite.config.js for frontend
+# Change the port in api.py for backend
+```
+
+## Screenshots
+
+### Main Interface
+Clean, modern interface with YouTube URL input
+
+### Processing View
+Real-time status updates showing AI agent progress
+
+### Results Display
+Comprehensive summary with statistics and architecture visualization
+
+## Contributing
+
+This project is part of the Agentic AI Hackathon submission.
+
+## License
+
+MIT License - See LICENSE file for details
diff --git a/frontend/index.html b/frontend/index.html
new file mode 100644
index 000000000..a22993921
--- /dev/null
+++ b/frontend/index.html
@@ -0,0 +1,33 @@
+
+
+
+
+ );
+};
+
+export default LandingPage;
diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json
new file mode 100644
index 000000000..05d928ce5
--- /dev/null
+++ b/frontend/tsconfig.json
@@ -0,0 +1,27 @@
+{
+ "compilerOptions": {
+ "target": "ESNext",
+ "useDefineForClassFields": true,
+ "lib": ["DOM", "DOM.Iterable", "ESNext"],
+ "allowJs": false,
+ "skipLibCheck": true,
+ "esModuleInterop": true,
+ "allowSyntheticDefaultImports": true,
+ "strict": true,
+ "forceConsistentCasingInFileNames": true,
+ "module": "ESNext",
+ "moduleResolution": "bundler",
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "noEmit": true,
+ "jsx": "react-jsx",
+ "baseUrl": ".",
+ "paths": {
+ "@/*": ["./src/*"],
+ "@assets/*": ["./src/assets/*"]
+ },
+ "types": ["vite/client", "node"]
+ },
+ "include": ["src"],
+ "references": [{ "path": "./tsconfig.node.json" }]
+}
diff --git a/frontend/tsconfig.node.json b/frontend/tsconfig.node.json
new file mode 100644
index 000000000..97ede7ee6
--- /dev/null
+++ b/frontend/tsconfig.node.json
@@ -0,0 +1,11 @@
+{
+ "compilerOptions": {
+ "composite": true,
+ "skipLibCheck": true,
+ "module": "ESNext",
+ "moduleResolution": "bundler",
+ "allowSyntheticDefaultImports": true,
+ "strict": true
+ },
+ "include": ["vite.config.ts"]
+}
diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts
new file mode 100644
index 000000000..d4cdf73d0
--- /dev/null
+++ b/frontend/vite.config.ts
@@ -0,0 +1,23 @@
+import { defineConfig } from 'vite'
+import react from '@vitejs/plugin-react'
+import tailwindcss from '@tailwindcss/vite'
+import path from 'path'
+
+export default defineConfig({
+ plugins: [react(), tailwindcss()],
+ resolve: {
+ alias: {
+ '@': path.resolve(__dirname, './src'),
+ '@assets': path.resolve(__dirname, './src/assets'),
+ },
+ },
+ server: {
+ port: 3000,
+ proxy: {
+ '/api': {
+ target: 'http://localhost:8000',
+ changeOrigin: true,
+ }
+ }
+ }
+})
diff --git a/gemini_api_quickstart.py b/gemini_api_quickstart.py
new file mode 100644
index 000000000..3fc06181d
--- /dev/null
+++ b/gemini_api_quickstart.py
@@ -0,0 +1,13 @@
+from google import genai
+
+# The client gets the API key from the environment variable `GEMINI_API_KEY`.
+client = genai.Client()
+
+response = client.models.generate_content(
+ model="gemini-2.5-flash", contents="Explain how AI works in a few words"
+)
+print(response.text)
+
+
+# ffmpeg command to cut a segment of an audio file with timestamp + offset
+# ffmpeg -ss 60 -i input.mp3 -t 120 -c copy output.mp3
diff --git a/idea-guy-overview-12-12-25.md b/idea-guy-overview-12-12-25.md
new file mode 100644
index 000000000..99263ce85
--- /dev/null
+++ b/idea-guy-overview-12-12-25.md
@@ -0,0 +1,133 @@
+Subsystem 1: Text to 80 20 insights (partially done)
+
+Think of β80 20 insightβ as a typed object, not just a paragraph. If you treat it like structured data, everything downstream becomes easier.
+
+An insight object should include:
+
+β’ claim: one sentence takeaway
+β’ why it matters: one sentence
+β’ evidence: short quote from transcript (verbatim)
+β’ who said it: speaker label if available
+β’ timestamp start, timestamp end (filled later)
+β’ tags: topic, domain, vibe (tactical, philosophical, contrarian, story)
+β’ actionability score: 0 to 1
+β’ novelty score: 0 to 1
+β’ βclipabilityβ score: 0 to 1 (does it stand alone, does it have a punchy ending)
+
+How to generate them reliably:
+
+A. First pass: extract candidate moments
+You prompt the model to find 20 to 60 candidate βmomentsβ from the transcript, each with a short quote and a reason. The key is you force quotes so the model cannot hallucinate content.
+
+B. Second pass: compress and rank
+You run a second prompt that selects the top K moments by your product goal (viral, useful, dense) and rewrites each into the structured insight object above.
+
+C. Third pass: sanity checks
+Reject anything that has no supporting quote, has a quote that is too long, or is too dependent on earlier context.
+
+Why this matters: downstream retrieval gets dramatically better if you index insight objects as well as raw transcript chunks.
+
+
+Subsystem 2: Insights to timestamps (alignment)
+
+If you start from transcript text, you still need precise time ranges to cut clips.
+
+You have three viable approaches:
+ 1. Word level timestamps during transcription
+Best for MVP. Many speech to text systems can give word or segment timestamps. Then βevidence quoteβ can be matched back to the segment that contains it.
+ 2. Text to audio forced alignment
+More precise, more work. Useful later when you need frame accurate clips.
+ 3. Embedding based fuzzy alignment
+Fast, imperfect. You embed the quote and search over time chunk embeddings to find the best matching window.
+
+For hackathon: do option 1, plus a fuzzy fallback (option 3) for when quotes cross segment boundaries.
+
+Alignment rule of thumb that prevents ugly clips:
+
+β’ pick boundaries on sentence ends
+β’ try to start on a speaker turn boundary
+β’ add a small lead in and tail buffer so it sounds natural (but keep total length under your target)
+
+
+Subsystem 3: Clip cutter
+
+This is basically βffmpeg with taste.β
+
+Inputs:
+
+β’ audio or video file
+β’ start time, end time
+β’ optional: loudness normalization, optional: subtitles burned in later
+
+Outputs:
+
+β’ audio clip (mp3 or wav)
+β’ optional: video clip (mp4)
+β’ metadata: clip id, episode id, timestamps, speakers, transcript snippet
+
+Two extra upgrades that make clips feel pro:
+
+
+A. Voice activity detection (VAD) trimming
+Snip dead air at the beginning and end while staying inside your intended time window.
+
+B. Loudness normalization
+So a quiet podcast and a loud podcast do not whiplash the user.
+
+
+
+A. Voice activity detection (VAD) trimming
+Snip dead air at the beginning and end while staying inside your intended time window.
+
+B. Loudness normalization
+So a quiet podcast and a loud podcast do not whiplash the user.
+
+
+
+====
+
+Goal: one question, one episode, three clips, interactive voice loop.
+
+Step 1: Ingest a single episode
+Transcribe with timestamps, chunk it, store chunks.
+
+Step 2: Generate insight objects with quotes
+Store them and index them.
+
+Step 3: Implement clip cutting
+Given start and end, return an audio clip.
+
+Step 4: Voice question to clip
+Speech to text, retrieve top insight, map to timestamps, play clip.
+
+Step 5: Guide loop
+After clip plays, guide summarizes and offers next two clip options based on adjacent insights or related chunks.
+
+If you nail this loop, scaling to many podcasts is mostly infrastructure and cost control.
+
+
+ways this can fail, and how to preempt it
+ 1. The model invents insights that are not in the audio
+Fix: force verbatim quotes, reject anything without a quote match.
+ 2. The chosen clip needs earlier context
+Fix: clipability score, and boundary rules (start at a question, or start at the beginning of a thought).
+ 3. Retrieval returns βsameyβ clips
+Fix: enforce diversity. Penalize clips from the same time neighborhood, and require different tags for the options.
+ 4. Latency kills the magic
+Fix: preprocess transcripts, chunks, and embeddings offline. Only do retrieval, rerank, and cutting at query time.
+ 5. Content rights and platform constraints
+Fix: MVP can operate on user supplied episodes or creator opt in. Public viral discovery is a separate policy and licensing project.
+
+A clean mental model to keep you oriented
+
+Treat your system like a DJ set for ideas:
+
+β’ the transcript is the full record crate
+β’ insight objects are your curated βbest loopsβ
+β’ retrieval is finding the right record
+β’ clip cutting is cueing and beat matching
+β’ the guide is the DJ talking between tracks and taking requests
+
+Build the smallest version of that DJ that can take one voice request and play three great clips in a row. Everything else is scaling.
+
+If you want, I can also draft the exact tool interface for the guide (search, get clip, summarize, propose next) and the prompts for the insight extractor and the reranker, all in a way that is hard for the model to hallucinate.
diff --git a/images/folder-githb.png b/images/folder-githb.png
deleted file mode 100644
index cd7d8d5d5..000000000
Binary files a/images/folder-githb.png and /dev/null differ
diff --git a/index.html b/index.html
new file mode 100644
index 000000000..dbea9443b
--- /dev/null
+++ b/index.html
@@ -0,0 +1,12 @@
+
+
+
+
+
+ PodVibe.fm
+
+
+
+
+
+
diff --git a/index.ts b/index.ts
new file mode 100644
index 000000000..f67b2c645
--- /dev/null
+++ b/index.ts
@@ -0,0 +1 @@
+console.log("Hello via Bun!");
\ No newline at end of file
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 000000000..b21af69c5
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,6 @@
+{
+ "name": "PodVibe.fm",
+ "lockfileVersion": 3,
+ "requires": true,
+ "packages": {}
+}
diff --git a/podcast_api_example.py b/podcast_api_example.py
new file mode 100644
index 000000000..d62c6b044
--- /dev/null
+++ b/podcast_api_example.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+"""
+Download and transcribe NYT's "The Daily" podcast using Google Gemini API.
+
+API Key Required:
+ GOOGLE_API_KEY - Get from https://aistudio.google.com/apikey
+
+Usage:
+ # Transcribe latest episode
+ GOOGLE_API_KEY='your-key' python podcast_api_example.py
+
+ # Transcribe last 3 episodes
+ GOOGLE_API_KEY='your-key' python podcast_api_example.py --episode-count 3
+
+ # Use local Whisper instead (no API key needed)
+ python podcast_api_example.py --engine whisper
+"""
+import argparse
+import json
+import os
+import pathlib
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+from typing import List, Optional
+
+import requests
+
+# NYT The Daily RSS Feed
+THE_DAILY_RSS = "https://feeds.simplecast.com/Sl5CSM3S"
+
+# iTunes podcast namespace
+ITUNES_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
+
+
+@dataclass
+class Episode:
+ """Podcast episode metadata."""
+ title: str
+ audio_url: str
+ pub_date: datetime
+ description: Optional[str] = None
+ duration: Optional[str] = None
+ guid: Optional[str] = None
+
+
+def fetch_rss_feed(feed_url: str) -> ET.Element:
+ """Fetch and parse RSS feed XML."""
+ resp = requests.get(feed_url, timeout=30)
+ resp.raise_for_status()
+ return ET.fromstring(resp.content)
+
+
+def parse_rss_date(date_str: str) -> datetime:
+ """Parse RFC 822 date format used in RSS feeds."""
+ try:
+ return parsedate_to_datetime(date_str)
+ except (ValueError, TypeError):
+ return datetime.now()
+
+
+def parse_episode(item: ET.Element) -> Optional[Episode]:
+ """Parse an RSS element into an Episode."""
+ title = item.findtext("title", "Untitled")
+
+ # Get audio URL from enclosure tag
+ enclosure = item.find("enclosure")
+ if enclosure is None:
+ return None
+ audio_url = enclosure.get("url")
+ if not audio_url:
+ return None
+
+ pub_date_str = item.findtext("pubDate", "")
+ pub_date = parse_rss_date(pub_date_str)
+
+ description = item.findtext("description")
+ duration = item.findtext("itunes:duration", namespaces=ITUNES_NS)
+ guid = item.findtext("guid")
+
+ return Episode(
+ title=title,
+ audio_url=audio_url,
+ pub_date=pub_date,
+ description=description,
+ duration=duration,
+ guid=guid,
+ )
+
+
+def get_episodes(feed_url: str, count: int = 1) -> List[Episode]:
+ """Fetch the most recent episodes from an RSS feed."""
+ root = fetch_rss_feed(feed_url)
+ channel = root.find("channel")
+ if channel is None:
+ raise ValueError("Invalid RSS feed: no channel element")
+
+ items = channel.findall("item")[:count]
+ episodes = []
+ for item in items:
+ ep = parse_episode(item)
+ if ep:
+ episodes.append(ep)
+ return episodes
+
+
+def download_file(url: str, out_path: pathlib.Path) -> pathlib.Path:
+ """Download a file with streaming to handle large podcast files."""
+ out_path.parent.mkdir(parents=True, exist_ok=True)
+ with requests.get(url, stream=True, timeout=120) as r:
+ r.raise_for_status()
+ with open(out_path, "wb") as f:
+ for chunk in r.iter_content(chunk_size=1024 * 1024):
+ if chunk:
+ f.write(chunk)
+ return out_path
+
+
+def transcribe_with_gemini(audio_path: pathlib.Path) -> str:
+ """
+ Transcribe audio using Google Gemini API.
+
+ Uses File API for files > 20MB (typical for podcasts).
+ Requires GOOGLE_API_KEY environment variable.
+ """
+ import time
+ from google import genai
+ from google.genai import types
+ from google.genai.errors import ClientError
+
+ api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
+ if not api_key:
+ raise EnvironmentError(
+ "GOOGLE_API_KEY or GEMINI_API_KEY environment variable required. "
+ "Get one at https://aistudio.google.com/apikey"
+ )
+
+ client = genai.Client(api_key=api_key)
+
+ file_size = audio_path.stat().st_size
+ file_size_mb = file_size / (1024 * 1024)
+
+ # Determine MIME type from extension
+ suffix = audio_path.suffix.lower()
+ mime_types = {
+ ".mp3": "audio/mp3",
+ ".wav": "audio/wav",
+ ".m4a": "audio/mp4",
+ ".aac": "audio/aac",
+ ".ogg": "audio/ogg",
+ ".flac": "audio/flac",
+ }
+ mime_type = mime_types.get(suffix, "audio/mpeg")
+
+ prompt = (
+ "Transcribe this audio completely and accurately. "
+ "Output only the transcript text, with no timestamps, speaker labels, or annotations. "
+ "Preserve natural paragraph breaks where appropriate."
+ )
+
+ # Models to try in order (fallback on rate limits)
+ models = ["gemini-2.0-flash", "gemini-1.5-flash", "gemini-1.5-pro"]
+
+ uploaded_file = None
+ if file_size_mb > 20:
+ print(f" Uploading {file_size_mb:.1f}MB to Gemini File API...")
+ uploaded_file = client.files.upload(file=str(audio_path))
+
+ last_error = None
+ for model in models:
+ for attempt in range(3): # Retry up to 3 times per model
+ try:
+ if uploaded_file:
+ print(f" Transcribing with {model} (this may take a few minutes)...")
+ response = client.models.generate_content(
+ model=model,
+ contents=[prompt, uploaded_file],
+ )
+ else:
+ with open(audio_path, "rb") as f:
+ audio_bytes = f.read()
+ print(f" Transcribing with {model}...")
+ response = client.models.generate_content(
+ model=model,
+ contents=[
+ prompt,
+ types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
+ ],
+ )
+
+ # Clean up uploaded file on success
+ if uploaded_file:
+ try:
+ client.files.delete(name=uploaded_file.name)
+ except Exception:
+ pass
+
+ return response.text.strip()
+
+ except ClientError as e:
+ last_error = e
+ if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e):
+ # Extract retry delay from error if available
+ wait_time = 40 * (attempt + 1) # 40s, 80s, 120s
+ print(f" Rate limited. Waiting {wait_time}s before retry...")
+ time.sleep(wait_time)
+ else:
+ raise # Re-raise non-rate-limit errors
+
+ print(f" Model {model} exhausted, trying next...")
+
+ # Clean up on failure
+ if uploaded_file:
+ try:
+ client.files.delete(name=uploaded_file.name)
+ except Exception:
+ pass
+
+ raise last_error or RuntimeError("All Gemini models failed")
+
+
+def transcribe_with_faster_whisper(audio_path: pathlib.Path) -> str:
+ """
+ Transcribe audio using local faster-whisper model.
+
+ Runs offline, no API key needed.
+ Requires: pip install faster-whisper
+ """
+ from faster_whisper import WhisperModel
+
+ print(" Loading Whisper model...")
+ model = WhisperModel("base", device="cpu", compute_type="int8")
+
+ print(" Transcribing (this may take a while)...")
+ segments, _info = model.transcribe(str(audio_path))
+ text_parts = [seg.text for seg in segments]
+ return "".join(text_parts).strip()
+
+
+def make_safe_filename(title: str, max_length: int = 80) -> str:
+ """Convert a title to a safe filename."""
+ safe = "".join(c for c in title if c.isalnum() or c in (" ", "_", "-"))
+ safe = safe.strip().replace(" ", "_")
+ return safe[:max_length]
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Download and transcribe NYT's The Daily podcast",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=__doc__,
+ )
+ parser.add_argument(
+ "--episode-count",
+ type=int,
+ default=1,
+ help="Number of recent episodes to process (default: 1)",
+ )
+ parser.add_argument(
+ "--engine",
+ choices=["gemini", "whisper"],
+ default="gemini",
+ help="Transcription engine (default: gemini)",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ default="podcasts_out",
+ help="Output directory for transcripts (default: podcasts_out)",
+ )
+ parser.add_argument(
+ "--keep-audio",
+ action="store_true",
+ help="Keep audio files after transcription (default: delete)",
+ )
+ args = parser.parse_args()
+
+ print(f"Fetching RSS feed from The Daily...")
+ episodes = get_episodes(THE_DAILY_RSS, count=args.episode_count)
+
+ if not episodes:
+ print("No episodes found!")
+ return
+
+ print(f"Found {len(episodes)} episode(s)\n")
+
+ out_dir = pathlib.Path(args.output_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ for i, episode in enumerate(episodes, 1):
+ print(f"[{i}/{len(episodes)}] {episode.title}")
+ print(f" Published: {episode.pub_date.strftime('%Y-%m-%d')}")
+ if episode.duration:
+ print(f" Duration: {episode.duration}")
+
+ safe_name = make_safe_filename(episode.title)
+ audio_path = out_dir / f"{safe_name}.mp3"
+ output_file = out_dir / f"{safe_name}.json"
+
+ # Skip if already processed
+ if output_file.exists():
+ print(f" Already processed, skipping...")
+ continue
+
+ # Download audio
+ print(f" Downloading audio...")
+ download_file(episode.audio_url, audio_path)
+ file_size_mb = audio_path.stat().st_size / (1024 * 1024)
+ print(f" Downloaded {file_size_mb:.1f}MB")
+
+ # Transcribe
+ if args.engine == "gemini":
+ transcript = transcribe_with_gemini(audio_path)
+ else:
+ transcript = transcribe_with_faster_whisper(audio_path)
+
+ # Save output
+ meta = {
+ "title": episode.title,
+ "source": "NYT The Daily",
+ "feed_url": THE_DAILY_RSS,
+ "pub_date": episode.pub_date.isoformat(),
+ "duration": episode.duration,
+ "audio_url": episode.audio_url,
+ "transcription_engine": args.engine,
+ "transcript": transcript,
+ }
+
+ with open(output_file, "w", encoding="utf-8") as f:
+ json.dump(meta, f, ensure_ascii=False, indent=2)
+
+ print(f" Saved: {output_file}")
+
+ # Clean up audio unless --keep-audio
+ if not args.keep_audio and audio_path.exists():
+ audio_path.unlink()
+ print(" Deleted audio file")
+
+ print()
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/podcasts_out/Trumps_Plan_to_Reorder_the_World.json b/podcasts_out/Trumps_Plan_to_Reorder_the_World.json
new file mode 100644
index 000000000..e296bf421
--- /dev/null
+++ b/podcasts_out/Trumps_Plan_to_Reorder_the_World.json
@@ -0,0 +1,10 @@
+{
+ "title": "Trump’s Plan to Reorder the World",
+ "source": "NYT The Daily",
+ "feed_url": "https://feeds.simplecast.com/Sl5CSM3S",
+ "pub_date": "2025-12-12T10:45:00+00:00",
+ "duration": "00:35:54",
+ "audio_url": "https://dts.podtrac.com/redirect.mp3/pdst.fm/e/pfx.vpixl.com/6qj4J/pscrb.fm/rss/p/nyt.simplecastaudio.com/03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/4b70ef33-ff51-4c3e-a5cd-5f0c41880d27/audio/128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=4b70ef33-ff51-4c3e-a5cd-5f0c41880d27&feed=Sl5CSM3S",
+ "transcription_engine": "whisper",
+ "transcript": "From the New York Times, I'm Natalie Kietroweth. This is the date. From slamming Europe and abandoning our commitments to our closest allies there, to carrying out a lethal U.S. military campaign in the Caribbean, President Trump has overseen an aggressive foreign policy that hasn't always been easy to understand. But the White House has now unveiled a national security strategy that offers a justification for those actions, laying bare Trump's true goals and alarming countries around the world. Today, my colleague David Sanger explains what the strategy actually is and how the emerging Trump doctrine it represents may change America's global relationships for good. It's Friday, December 12th. David, our resident foreign policy expert, it's great to have you here. Natalie, always great to be with you. There has been a lot of debate on the right among voters about Trump's focus on international affairs, a notion that he's not following through on a stated agenda of putting America and Americans first. And it's true that Trump in his second term has been extremely active around the world. He's been engaging in a trade war with China. He's bombed Iran's nuclear facility, broker to cease fire in Gaza. There's been the recent boat strikes in Latin America. And through all that, it hasn't always been all that clear exactly how all of these actions cohere. But now the Trump administration has released this document that tries to articulate the country's foreign policy strategy, that tries to make sense of it all. So first of all, what is this document? So this is the national security strategy. And administrations don't turn it out because they want to. They turn it out because they have to. Congress actually requires every administration to go do it. But it also ends up becoming a kind of Rorschach test of what administrations priorities are. And in this particular case, as you read this document, it's only about 30 pages long. 
The thing that really strikes you is that it is a retreat from the post-World War II bipartisan understanding that the role of the United States is to defend liberty, support democracies around the world, support our allies. And there's an absence in this strategy of a sort of moral mission for the United States to defend human rights, to defend free speech or free press. Almost all of that is gone. And instead, there's one really telling line that's on page 12 of the strategy. It says the days of the United States propping up the entire world order like atlas are over. And if those days are over, if that's no longer our priority, being the defender of liberty around the world, what is the new priority? Well, our priority is the latest interpretation of America first. And that means emphasizing not only trade, but making America wealthy. How many times have you heard President Trump say that? He said it again on Tuesday night in Pennsylvania that he would make America rich again on its way to making it great again. It is a document that is very heavy on how the United States will try to order the world for its benefit. You know, I've seen this in the coverage around this document that there is a focus on making America, as you said, wealthy, a focus on profit. What does that actually mean? Well, the president's concept here is that our greatest source of national strength is being the economic leader, the technological leader. Now, parts of this are quite common with Democrats and other Republican presidents. You saw Joe Biden try to bring semiconductor manufacturing back to the United States. But Trump is taking this to the next level here. He's basically saying that all the policies of the U.S. should be geared toward improving our wealth and our economic security. And he focuses many more pages on that than the traditional issues of national security. 
And what really struck me, Natalie, is this is not only different from most of the national security strategies we've seen since the end of World War II, it's dramatically different from Donald Trump's own national security strategy when he first came to office in 2017. Break that down for me. How is it different from Trump's own previous policy? Well, in 2017, his national security advisor looked around the national security landscape, and basically came to the conclusion that it was all still focused on counterterrorism, the understandable result of 9-11 and its aftermath and the wars in Iraq and Afghanistan. And that national security strategy in 2017 indicated that the U.S. government had to rapidly shift to a new era of superpower conflict, one in which Russia was a rising and aggressive power seeking to challenge the United States and its dominance, particularly in the West and in Europe, and that we had to counter China, the only country that could take us on militarily, financially, technologically, and if you think about something like TikTok, maybe even culturally. And so that document in 2017 basically reoriented the national security establishment of the United States toward thinking about how you would fight a new era of cold wars around the world. In this document, there's a hint of that, but not very much. It focuses on entirely different issues. The old document spent a lot of time on how the United States would deal with threats from rogue states. There are pages on North Korea, which at the time had about 20 nuclear weapons and was run by an erratic leader. In the new document, there's no mention of North Korea, in the entire 30 pages, even though they now have three times as many nuclear weapons, and they're still run by the same erratic leader. And Iran gets only the briefest mention, and then only to mention that the president sent stealth bombers over to take out three major nuclear sites back in June. 
But there's no follow-up about what the strategy would be to avoid future war with Iran. You're saying, basically, David, that there's much less direct discussion here of our adversaries and how to counter them. North Korea isn't in this document at all. Iran isn't there very much. So what does this document focus on in terms of our national security concerns? Well, Natalie, there's a lot of discussion and a lot of criticism of our closest allies, the Europeans. You know, I think the first thing that you see in this document is something that's pretty familiar to us all from the first term and certainly from the past year, which is that America is tired of supporting the allies, that it won't put up with Europe's trade blocks anymore, that is frustrated with the European Union, which, of course, the president has said was built to screw the United States, that we can't necessarily be supporting them in their conventional defense. And it makes clear that they have to go do that themselves. Now, the fact of the matter is, as the document acknowledges, the Europeans made a lot of progress in this regard over the past year. And you may remember that back over the summer, they committed to spend up to 5 percent of their GDP on defense. And that was a huge win for President Trump. And in my mind, something that was really long overdue and one of the big successes that he had this year. Right, they went from 2 percent of GDP to now saying they'd spend up to 5 percent. It was a huge increase. Huge increase, some of it's for concrete defense. And I think it's certainly fair to say that his threat to leave NATO and to abandon Europe certainly focused their attention. The next big debate we have, of course, is whether or not it was in our interest as well, because we get a lot of benefits from a tight alliance with the Europeans who can act as a deterrent against war with Russia and other bad actors. So yes, it was a big win. It may have come at some long-term cost. 
But let's acknowledge that President Trump was able to do what Barack Obama and Joe Biden and Trump himself in his first term proved unable to do, which is get the Europeans belatedly to take their defense seriously. So there's clearly an upside from the US perspective in getting Europe to kick in more for its own defense. But as advanced as Europe is, it can't compete with the US military power yet, right? It would take them a long time to build up their militaries to that point. And I guess I wonder if the US is retreating from supporting Europe on defense. That will make Europe less capable of countering what it sees as one of its biggest threats right now, which is Russia. Does this strategy contend with the potential that Russia gains an upper hand over Europe if US military support receipts? So Natalie, the Russia section of this is one of the strangest, because it suggests that the Europeans were a greater threat to themselves than Russia is to the future of Europe. And this is the exact opposite of how the Europeans view it, because they believe now that Russia is an existential threat to them. And if successful in Ukraine will just keep going sooner or later. And that is a huge shift, of course. But it's one that the Europeans have seen coming from the US all year. They may not have seen it in black and white the way they did in the strategy, but it was certainly no surprise. But what they weren't ready for was this line on page 25 of the report that talks about Europe's economic decline. But then it also discusses the waves of migration that have changed the nature of European democracies. And it warns that this economic decline is eclipsed by the real and more stark prospect of civilizational erasure. And that's the line that really resonated in Europe. Yeah, let's just pause there, because that line really made a lot of waves. So what does the Trump administration mean when it says there's civilizational erasure happening in Europe? 
Well, this has been a topic of great debate about how you interpret that. But I think the most common interpretation is that the president is saying that the migration that has changed the face of Germany, of France, even of Britain has fundamentally altered the nature of these European allies, and that Europe that the president came to think up. His parents emerged. His mother was a Scottish immigrant. His father was the first generation descendant of German immigrants to the United States in the late 1880s that that Europe was almost gone. And many read that line as a complaint that there is a diminishment of the white European allies that the president imagines when he thinks of Europe. And I think what this document is doing is saying Europe's threatening its own future existence and identity. Well, you're starting to point at this, David, but just explain why this concept that migration is changing the demographic makeup of Europe and the culture, perhaps, of Europe. Why is it important for the Trump administration? Like what does it have to do with American foreign policy? I think it has to do with their image of countries with common values, not only with the United States, but with President Trump and the MAGA movement. And there was a lot of this in the JD Vance speech to the Munich Security Conference last February that was such a shock where he said, your big adversary is not Russia. Your big adversary is the waves of migration that are changing your societies. And I think the Europeans who viewed that diversity as a strength, a revitalization of Europe, were truly shocked to hear that. And this whole section of the National Security Strategy reads like it is an expansion of the Vance speech in Munich in February. Can I just ask, I understand that there were many Europeans who were shocked by this when Vance brought it up and perhaps have been shocked by it in this document. 
But it's true, right, that there has been this massive wave of migration across Europe. And there is a lot of discontent with it. There are a lot of people who aren't in favor of it and don't see it as something they want in their countries. That's absolutely right. And you sense this whenever you're in Europe and you just read it in the headlines, see it in protests on the streets, see it in the clashes between these new migrants who are coming in and traditional Europeans. So the core of the Trump argument in this document is basically a warning to the Europeans that you are ignoring your own voters, that you're suppressing free speech by suppressing the right wing by trying to keep them down, by refusing to allow them to take power, right? And that's true in their minds in Germany, in their minds it's true in France, and their minds it's even true in Great Britain. And just explain why that's problematic in the eyes of the Trump administration. Well, to the Trump administration, I think there are sort of two reasons. The stated reason is that a Europe that is divided like this, that is suppressing the will of its own voters, that's keeping the right wing from coming up, is basically an unstable Europe. That can't as the document says, operate as a group of aligned sovereign nations, taking primary responsibility for their own defense without being dominated by an adversarial power. But I think we have to allow for the possibility, Natalie, that what's really going on here is they want like-minded, mega-oriented governments in these nations. And they think the current European establishment is standing in the way of that goal. And so what if anything is the Trump administration saying they're going to do about that problem? Well, they're vague about what they do about it. There is this line saying that among their priorities is cultivating resistance to Europe's current trajectory within European nations. But we don't really know what that means. 
Does it mean that the president is going to endorse right wing patriotic candidates as if he was endorsing Republican or MAGA-oriented governors or senators running for election in the United States? Would he be interfering in their elections? He doesn't say except in the traitor arena where, of course, he's quite specific about his goals. So just to sum up what you've told us about the Europe policy that's articulated here, this is much of what we've seen already in the Trump administration saying basically Europe you're on your own in terms of paying for your own defense, we're not going to be as involved as we have been. And then you're also seeing this expressed concern over mass migration in Europe and civilizational erasure. So what do you make of this altogether? How should we interpret it? It's hard to tell because parts of it are contradictory, but you emerge from reading the document thinking that the United States is carving out an exception for itself to step in and intervene in Europe to get to the kind of society that President Trump and his allies think they want and think that many Europeans want. And yet buried in this assertion of a right to interfere with Europe's internal politics, even directly engaging with European voters, there's this sort of strange undertone of retreat, a real sense that overall the US is turning away from Europe. And so then the question becomes, if the US is retreating from our traditional European allies, where are we turning? Well, for the past decade, the Europeans have been worried that the US is turning to Asia, that it's focusing on China and Japan and South Korea, the booming economies. But what this document says is that the US is ready once again to turn its attention to our own region, to focus on our own backyard. We'll be right back. David, before the break, you said that this document articulates that our backyard is now the focus of America's foreign policy. 
So what exactly does the Trump administration want to do in our backyard? And how does it define our backyard? Well, let's start with the second question first, Natalie, because you're talking about a president who spent his life as a real estate mogul. And the real estate he has in mind here is pretty big. It goes from Canada over to Greenland. It runs down through the newly named Gulf of America, through the Panama Canal, which of course he said we never should have given away all the way down to the tip of Argentina. And the president's idea here is that the United States should have complete and total dominance of the Western hemisphere. And so the president advocates in this document that we return to and expand on the Monroe doctrine. Now, at the risk of making some of our listeners shake in fear as they try to recall 11th grade history. Yeah, let's get this spark notes. Absolutely. We're slipping you the copy right now, Natalie. The Monroe doctrine, which dates back to 1823, declared that the Western hemisphere would essentially be closed to European colonization. The Europeans had to stay out of our territory, which was a pretty bold thing to say for a country that was about 45 years old and barely had a navy at the time, right? But over the following 200 years, we both expanded from and retreated from the Monroe doctrine. How so? Well, Teddy Roosevelt in 1905 issued a Roosevelt corollary to this doctrine that basically said we reserve the right to intervene in Latin America if we see governments that are coming together that are not to our liking. And we did exactly that, right? We got involved in a civil uprising in Colombia, and we ended up with the Panama Canal. I mean, we did a whole bunch of actions up through the 1950s when the CIA was conducting coups in the region. But then we turned away for a bit. 
We had the Cold War, we had the collapse of the Soviet Union, we had 9.11, and we began to think as China rose, that it's really in the Indo-Pacific that we needed to concentrate our forces. What this document is saying is it's time to come home again and to focus on our home region. And when you read the actual document, the president talks about building on the Monroe doctrine and creating a sort of Trump corollary to it. What it's saying is we are going to control access to the region, we're going to stop drugs to the region, and we're going to make sure that you see the US military in the region. And here's where the document gets more specific than in almost any other place in the 30 pages. There's a page where it says we're going to readjust our global military presence to address these urgent threats in our own hemisphere because we live here. Well, that may not sound like a big decision, but we've had a succession of American presidents, Democrats, Republicans alike, who have said we are going to go focus on the Indo-Pacific because that's where our future is, that's where our trade is, that's where China and India are. So this would basically put a halt to that kind of expansion in a world of limited resources and bring those forces back home. It says that we're going to design a more suitable Coast Guard and Navy presence to control sea lines. It says we're going to defeat cartels and put people at the border to do that. And where necessary, the use of lethal force to replace the failed law enforcement only strategy of the last several decades. So what it does is essentially give a retroactive justification or rationale for what we've seen with the US military sinking these boats in the Caribbean and killing alleged drug runners. 
That's right, it's giving a strategic rationale for things he is wanted to do and is doing anyway, but it even goes beyond that, Natalie, because there are sections of it that explicitly say that to make this work, we're going to kick other powers out of our region. And that's code word for China, because in the past, Natalie, when the US turned its attention away from the Western hemisphere, when his focus was on China and the Indo-Pacific, the irony is that the Chinese were moving into Latin America and really made huge inroads there economically. And that makes it all the stranger that the document doesn't really name China specifically as the big player in the region. And there's been some debate about why that is. Some people think it was the work of Scott Besan, the Treasury Secretary, who's trying to negotiate trade deals with China. He doesn't want to particularly anger them before President Trump's schedule visit to Beijing in April. But what it certainly does is it suggests the US is going to boot out anybody who it thinks shouldn't be there. And what does it actually mean to kick China out of Latin America? Well, in some cases, the President has declared he wants Chinese companies physically out of say the Panama Canal, where he exaggerated the control of the Chinese military, which has very little presence there. But what he really means is that by expanding the American economic presence, by making sure that everyone in the region buys American products and runs on American operating systems, that basically he would squeeze out the Chinese and other competitors. And of course, you travel on Latin America. What do you see? You see Huawei phones. You see Chinese 5G networks. And it's those areas where I think the President wants to get that replaced with American hardware and software. 
David, just to step back here for a moment, can you help me understand how the taking up of the Monroe doctrine, this new muscular posture that we're going to be taking in Latin America with new military deployments? Isn't that out of step with the original understanding that I think a lot of us had of America first as this policy of isolationism? There is certainly a split in MAGA world about isolationism and interventionism. And there are a lot of the President's followers who would like to just build big walls around the continental United States, maybe include Hawaii and Alaska in that and basically say, everybody stay away and we're not going to mess in your business. You saw MAGA members who were opposed to taking out the nuclear facilities in Iran. The President basically told them to sit down and shut up while he bombed them and then came home. But what we're discussing in Latin America is really pulling MAGA apart here because they see in our conflicts over the boats in the President's promise repeated just this week that there would be land strikes that we could end up in forever wars in our own neighborhood. So what makes this worth it to Trump? Like why risk pulling MAGA apart as you said over this? What does the US actually get out of reasserting its dominance in this really aggressive way? Well, one possibility of what we get out of it is basically a spheres of influence kind of organization of the world, something we haven't really seen since the late 1800s. This is a world in which the United States dominates its own territory that China dominates the Pacific and that the Europeans dominate Europe. But if they don't get their act together, maybe Vladimir Putin dominates Europe. It establishes essentially that we each carve up the globe and sort of respect the other territories as the other guy's problem. And of course, this is a vision that coincides with another world leader's idea of how the globe should be organized. 
And that's Putin himself who has frequently talked about the spheres of influence kind of organization of the world. But how is declaring a sphere of influence coherent with America first? I mean, you could see how leaving the rest of the world to its own devices jibes with an isolationist view of how to engage with the world. But help me make sense of the internal logic of declaring American predominance over an entire swath of the globe. I think this is where we see the America first doctrine becoming something closer to America's first. With an S. America's with an S that he views the region as basically the subsidiary of the United States. And you know, I've traveled with President Trump. I've covered five American presidents since I got back to Washington from all life as a foreign correspondent. And my takeaway is that Trump is really not an isolationist. He never has been. He's actually more of a unilateralist. What do you mean by that? Well, he wants the total freedom of action. He knows that he is not really interested in democracy promotion. He knows that he wants to prioritize economics and economic development over everything. Even if those economics don't necessarily come with security benefits to the US. But I also think that what's really notable about this strategy is that it doesn't cast our traditional adversaries, China and Russia, but mostly China, as global strategic challengers, much less a threat to the US. So one would think from these documents that Europe's troubles pose a greater threat to the US than any of the above. David, we've been talking about this document as a major pivot and a reorientation of American foreign policy. But I have to ask, as someone who has spent so much time covering American leaders and American actions across the globe, how enduring is the shift that we're seeing represented here? Does it last beyond this president? The closest analogy I can make is Trump and the White House itself. 
The next president can come in and scrape all the gold off of the Oval Office walls and put turf back down in the Rose Garden. But whoever it is is not going to be able to go rebuild the East Wing. There's going to be a ballroom and you're going to have to learn how to live with it or like it. And my guess is that the foreign policy of this president is going to have a similar effect. That at this point, the world is going to assume that the United States always has the ability to turn back in on itself and that each region of the world and even our allies are going to have to learn to depend on themselves. And I don't think that there is anything we can do over the next generation no matter who becomes elected president to make them believe that the U.S. is always going to be with them. I think the fundamental trust in the U.S. as the defender of a certain set of concepts of the West has been shattered for some time. David, thanks so much. Thank you, Natalie. We'll be right back. Here's what else you need to know today. This is one of the most consequential votes this Senate will take all year. By saying yay or nay to the clerk of the Senate, senators will decide whether people live or people die. On Thursday afternoon, Senate Republicans blocked a Democratic bill to extend federal healthcare subsidies, making it all but certain that insurance costs will surge for millions of Americans by the end of the year. The bill would have extended the subsidies for the Affordable Care Act by three years, a rallying cry for Democrats and their central motivation for shutting down the government for 43 days. But even with four Senate Republicans backing the measure, Democrats fell short of the 60 votes they needed to pass it. And President Trump signed an executive order that seeks to block states from enforcing laws that regulate the artificial intelligence industry, a win for big tech that puts dozens of AI safety and consumer protection laws at risk. 
The order gives the Attorney General broad authority to overturn laws that don't support the quote, United States' global AI dominance. If states keep their laws in place, Trump directed federal regulators to withhold funds for broadband and other projects. Finally, in a stinging defeat for Trump, Republican state lawmakers in Indiana have rejected a new congressional map ordered up by the White House that would have made it harder for Democrats to win any congressional seats in the state. The Republican lawmakers who voted against the new map said that it would undermine people's faith in government, and warn Trump that he should stay out of the state's politics. Today's episode was produced by Olivia Nat and Anna Foley. It was edited by Maria Byrne and Lizzo Baelin with help from Paige Cowatt. Contains music by Alicia Byrne and Mary Luzano and was engineered by Chris Wood. That's it for the Daily, I'm Natalie Kitchoweth. See you Monday."
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..7068f253a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+# Core dependencies
+requests>=2.31.0
+
+# Gemini API (primary transcription engine)
+google-genai>=0.3.0
+
+# Local transcription fallback (offline, no API key needed)
+faster-whisper>=0.10.0
diff --git a/setup_youtube_api.sh b/setup_youtube_api.sh
new file mode 100755
index 000000000..24e6f43b3
--- /dev/null
+++ b/setup_youtube_api.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+# Setup YouTube API Key for PodVibe.fm
+
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo "β β"
+echo "β π YouTube API Key Setup β"
+echo "β β"
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+
+cd src
+
+# Check if .env exists
+if [ -f ".env" ]; then
+ echo "β .env file found"
+ echo ""
+
+ # Check if it has YOUTUBE_API_KEY
+ if grep -q "YOUTUBE_API_KEY" .env; then
+ echo "βΉοΈ YOUTUBE_API_KEY already in .env"
+ echo ""
+ current_key=$(grep "YOUTUBE_API_KEY=" .env | cut -d'=' -f2)
+ if [ -z "$current_key" ] || [ "$current_key" = "your_youtube_api_key_here" ]; then
+ echo "β οΈ But it's not set to a real value yet."
+ echo ""
+ else
+ echo "β YouTube API Key appears to be configured!"
+ echo ""
+ read -p "Do you want to update it? (y/N): " update
+ if [[ ! $update =~ ^[Yy]$ ]]; then
+ echo "Keeping existing configuration."
+ exit 0
+ fi
+ fi
+ fi
+else
+ echo "π Creating new .env file..."
+ touch .env
+fi
+
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo " HOW TO GET YOUTUBE API KEY"
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+echo "1. Go to: https://console.cloud.google.com/"
+echo "2. Create/select a project"
+echo "3. Enable 'YouTube Data API v3'"
+echo "4. Create Credentials β API Key"
+echo "5. Copy the API key"
+echo ""
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+
+read -p "Enter your YouTube API Key: " YOUTUBE_KEY
+
+if [ -z "$YOUTUBE_KEY" ]; then
+ echo ""
+ echo "β No key entered. Exiting."
+ exit 1
+fi
+
+# Check if GEMINI_API_KEY exists
+if ! grep -q "GEMINI_API_KEY" .env; then
+ echo ""
+ read -p "Enter your Gemini API Key (required): " GEMINI_KEY
+
+ cat > .env << EOF
+# Google Gemini API Key (Required)
+GEMINI_API_KEY=$GEMINI_KEY
+
+# YouTube Data API Key (Optional - for real trending data)
+YOUTUBE_API_KEY=$YOUTUBE_KEY
+EOF
+else
+ # Just add/update YouTube API key
+ if grep -q "YOUTUBE_API_KEY" .env; then
+ # Update existing
+ if [[ "$OSTYPE" == "darwin"* ]]; then
+ sed -i '' "s|YOUTUBE_API_KEY=.*|YOUTUBE_API_KEY=$YOUTUBE_KEY|" .env
+ else
+ sed -i "s|YOUTUBE_API_KEY=.*|YOUTUBE_API_KEY=$YOUTUBE_KEY|" .env
+ fi
+ else
+ # Add new line
+ echo "" >> .env
+ echo "# YouTube Data API Key (Optional - for real trending data)" >> .env
+ echo "YOUTUBE_API_KEY=$YOUTUBE_KEY" >> .env
+ fi
+fi
+
+echo ""
+echo "β YouTube API Key configured!"
+echo ""
+echo "π§ͺ Testing configuration..."
+python3 << PYEOF
+import os
+from dotenv import load_dotenv
+load_dotenv()
+gemini = os.getenv('GEMINI_API_KEY')
+youtube = os.getenv('YOUTUBE_API_KEY')
+print(f"β GEMINI_API_KEY: {'Set' if gemini else 'Missing'}")
+print(f"β YOUTUBE_API_KEY: {'Set' if youtube else 'Missing'}")
+if youtube:
+ print("\nπ You'll get REAL trending videos from YouTube!")
+PYEOF
+
+echo ""
+echo "Start backend with: python3 api.py"
+echo ""
+
diff --git a/src/App.tsx b/src/App.tsx
new file mode 100644
index 000000000..b5fb6637e
--- /dev/null
+++ b/src/App.tsx
@@ -0,0 +1,16 @@
+import { BrowserRouter, Routes, Route } from "react-router-dom";
+import { Home } from "./pages/Home";
+import { Browse } from "./pages/Browse";
+import { Player } from "./pages/Player";
+
+export function App() {
+  return (
+    <BrowserRouter>
+      <Routes>
+        <Route path="/" element={<Home />} />
+        <Route path="/browse" element={<Browse />} />
+        <Route path="/player/:categoryId" element={<Player />} />
+      </Routes>
+    </BrowserRouter>
+  );
+}
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 000000000..32213f65f
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# YouTube Podcast Summarizer Package
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 000000000..01faece36
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,177 @@
+"""
+Flask API Backend for PodVibe.fm
+Connects React frontend with the Agentic AI system
+"""
+
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+import sys
+import os
+from datetime import datetime
+
+# Add src to path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from youtube_summarizer import YouTubeSummarizer
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+app = Flask(__name__)
+CORS(app) # Enable CORS for React frontend
+
+# Initialize the summarizer (lazy loading)
+summarizer = None
+
def get_summarizer():
    """Return the module-wide YouTubeSummarizer, creating it on first use.

    Lazy construction keeps the module importable even when the
    summarizer's own initialization would raise.
    """
    global summarizer
    if summarizer is not None:
        return summarizer
    summarizer = YouTubeSummarizer()
    return summarizer
+
+
@app.route('/api/health', methods=['GET'])
def health_check():
    """Liveness probe: report service status and the current server time."""
    body = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
    }
    return jsonify(body)
+
+
@app.route('/api/summarize', methods=['POST'])
def summarize_video():
    """
    Summarize a YouTube video.

    Request JSON:
    {
        "url": "https://www.youtube.com/watch?v=..."
    }

    Response JSON (200):
    {
        "success": true,
        "video_id": "...",
        "summary": "...",
        "keywords": [...],
        "transcript_length": 12345,
        "segments": 123,
        "timestamp": "2025-12-12T10:30:00",
        "memory_log": [...]
    }

    Returns 400 for a missing/invalid URL, 500 for processing failures.
    """
    try:
        # silent=True makes a missing or malformed JSON body yield None instead
        # of raising, so bad requests get a clean 400 below rather than a 500.
        data = request.get_json(silent=True) or {}
        # `or ''` also guards against an explicit JSON null for "url",
        # which would otherwise crash on .strip().
        url = (data.get('url') or '').strip()

        if not url:
            return jsonify({
                'success': False,
                'error': 'YouTube URL is required'
            }), 400

        # Cheap host sanity check before spending time on processing.
        if 'youtube.com' not in url and 'youtu.be' not in url:
            return jsonify({
                'success': False,
                'error': 'Invalid YouTube URL'
            }), 400

        # Process the video using the agentic AI system.
        print(f"π¬ Processing YouTube URL: {url}")
        summ = get_summarizer()
        result = summ.process_youtube_url(url)

        # Get the agent's memory log for the frontend timeline view.
        memory_log = summ.get_memory_log()

        # Format response; .get defaults keep the payload shape stable even if
        # the summarizer omits a field.
        response = {
            'success': True,
            'video_id': result.get('video_id', ''),
            'summary': result.get('summary', ''),
            'keywords': result.get('keywords', []),
            'transcript_length': result.get('transcript_length', 0),
            'segments': result.get('segments', 0),
            'timestamp': datetime.now().isoformat(),
            'memory_log': memory_log
        }

        return jsonify(response), 200

    except Exception as e:
        # NOTE(review): str(e) is echoed to the client; consider logging the
        # traceback server-side and returning a generic message in production.
        print(f"β Error processing request: {str(e)}")
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
+
+
@app.route('/api/models', methods=['GET'])
def list_models():
    """Return the Gemini models that support the generateContent method."""
    try:
        # Imported lazily so the endpoint alone depends on the genai package.
        import google.generativeai as genai

        genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

        model_list = []
        for m in genai.list_models():
            if 'generateContent' not in m.supported_generation_methods:
                continue
            model_list.append({
                'name': m.name,
                'display_name': m.display_name,
                'description': m.description,
            })

        return jsonify({
            'success': True,
            'models': model_list
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
+
+
@app.route('/api/trending', methods=['GET'])
def get_trending():
    """Get trending videos by category using YouTube API or sample data"""
    # Imported lazily so api.py can still start if the trending module has
    # import-time problems; the cost is paid only when this route is hit.
    from trending import get_trending_podcasts_route
    # Response construction is delegated entirely to the trending module.
    return get_trending_podcasts_route()
+
+
if __name__ == '__main__':
    # Startup banner: where the API listens and which endpoints it serves.
    print("π Starting PodVibe.fm API...")
    print("π‘ React frontend should connect to: http://localhost:8000")
    print("π§ API endpoints available:")
    print(" - GET /api/health - Health check")
    print(" - POST /api/summarize - Summarize YouTube video")
    print(" - GET /api/models - List available models")
    print(" - GET /api/trending - Get trending videos")
    print("\n" + "="*80)

    # Check for API keys loaded from .env (see load_dotenv() above).
    # GEMINI_API_KEY is required for summarization; warn loudly if absent.
    if not os.getenv('GEMINI_API_KEY'):
        print("β οΈ WARNING: GEMINI_API_KEY not found in environment!")
        print(" Make sure your .env file is configured correctly.")
    else:
        print("β GEMINI_API_KEY found")

    # YOUTUBE_API_KEY is optional; without it /api/trending serves sample data.
    if not os.getenv('YOUTUBE_API_KEY'):
        print("βΉοΈ YOUTUBE_API_KEY not found - using sample data for trending")
        print(" To get real trending videos, add YOUTUBE_API_KEY to .env")
        print(" Run: ./setup_youtube_api.sh")
    else:
        print("β YOUTUBE_API_KEY found - using real YouTube data!")

    print("="*80 + "\n")

    # NOTE(review): debug=True enables the Werkzeug debugger/auto-reload;
    # combined with host='0.0.0.0' this must not be used in production.
    app.run(host='0.0.0.0', port=8000, debug=True)
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 000000000..79543655c
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,173 @@
+"""
+Streamlit UI for PodVibe.fm
+"""
+
+import streamlit as st
+import sys
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Add src to path
+sys.path.append(os.path.dirname(__file__))
+
+from youtube_summarizer import YouTubeSummarizer
+
+
+def main():
+ st.set_page_config(
+ page_title="PodVibe.fm - AI Podcast Summarizer",
+ page_icon="ποΈ",
+ layout="wide"
+ )
+
+ st.title("ποΈ PodVibe.fm")
+ st.markdown("**AI-Powered Podcast Summarizer | Powered by Google Gemini**")
+ st.markdown("---")
+
+ # Sidebar for configuration
+ with st.sidebar:
+ st.header("βοΈ Configuration")
+
+ api_key = st.text_input(
+ "Gemini API Key",
+ type="password",
+ help="Enter your Google Gemini API key",
+ value=os.getenv('GEMINI_API_KEY', '')
+ )
+
+ summary_type = st.selectbox(
+ "Summary Type",
+ options=['comprehensive', 'brief', 'key_points'],
+ help="Choose the type of summary you want"
+ )
+
+ st.markdown("---")
+ st.markdown("### π About")
+ st.markdown("""
+ This app uses:
+ - **YouTube Transcript API** to fetch video transcripts
+ - **Google Gemini** to generate intelligent summaries
+ - **Agentic AI** pattern for step-by-step processing
+ """)
+
+ # Main content
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+ st.header("π Enter YouTube URL")
+ youtube_url = st.text_input(
+ "YouTube Video URL",
+ placeholder="https://www.youtube.com/watch?v=...",
+ help="Paste the URL of any YouTube video with available captions"
+ )
+
+ with col2:
+ st.header("π Actions")
+ process_button = st.button("Generate Summary", type="primary", use_container_width=True)
+
+ # Process video when button is clicked
+ if process_button:
+ if not api_key:
+ st.error("β Please enter your Gemini API key in the sidebar")
+ return
+
+ if not youtube_url:
+ st.error("β Please enter a YouTube URL")
+ return
+
+ try:
+ # Initialize summarizer
+ with st.spinner("π§ Initializing AI agent..."):
+ summarizer = YouTubeSummarizer(api_key=api_key)
+
+ # Create progress container
+ progress_container = st.container()
+
+ with progress_container:
+ st.markdown("### π Processing Steps")
+
+ # Step 1: Extract video ID
+ with st.spinner("π Step 1: Extracting video ID..."):
+ video_id = summarizer.extract_video_id(youtube_url)
+ st.success(f"β Video ID: `{video_id}`")
+
+ # Step 2: Fetch transcript
+ with st.spinner("π₯ Step 2: Fetching transcript..."):
+ transcript = summarizer.get_transcript(video_id)
+ st.success(f"β Transcript fetched ({len(transcript):,} characters)")
+
+ # Step 3: Generate summary
+ with st.spinner(f"π€ Step 3: Generating {summary_type} summary..."):
+ summary = summarizer.summarize_text(transcript, summary_type)
+ st.success("β Summary generated!")
+
+ # Display results
+ st.markdown("---")
+ st.header("π Summary")
+
+ # Create tabs for different views
+ tab1, tab2, tab3 = st.tabs(["π Summary", "π Full Transcript", "π§ Agent Memory"])
+
+ with tab1:
+ st.markdown(summary)
+
+ # Download button
+ st.download_button(
+ label="β¬οΈ Download Summary",
+ data=summary,
+ file_name=f"summary_{video_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+ mime="text/plain"
+ )
+
+ with tab2:
+ with st.expander("Show full transcript"):
+ st.text_area("Transcript", transcript, height=400)
+
+ with tab3:
+ st.markdown("### Agent Activity Log")
+
+ # Show session summary
+ session_summary = summarizer.get_session_summary()
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric("Total Events", session_summary['total_events'])
+ with col2:
+ st.metric("Tasks Completed", session_summary.get('event_breakdown', {}).get('task_completed', 0))
+ with col3:
+ st.metric("Tasks Failed", session_summary.get('event_breakdown', {}).get('task_failed', 0))
+
+ st.markdown("---")
+
+ # Show execution timeline
+ st.markdown("#### Execution Timeline")
+ timeline = summarizer.get_execution_timeline()
+ for event in timeline:
+ with st.expander(f"π {event['timestamp']} - {event['event']}"):
+ st.json(event['details'])
+
+ # Full memory log
+ st.markdown("#### Complete Memory Log")
+ memory_log = summarizer.get_memory_log()
+ for entry in memory_log:
+ with st.expander(f"{entry['event_type']} - {entry['timestamp']}"):
+ st.json(entry['data'])
+
+ except Exception as e:
+ st.error(f"β Error: {str(e)}")
+ st.exception(e)
+
+    # Footer
+    st.markdown("---")
+    st.markdown("""
+    <div style="text-align: center">
+    Built with β€οΈ using Streamlit and Google Gemini | Agentic AI Hackathon Template
+    </div>
+    """, unsafe_allow_html=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/pages/Home.tsx b/src/pages/Home.tsx
new file mode 100644
index 000000000..06636a946
--- /dev/null
+++ b/src/pages/Home.tsx
@@ -0,0 +1,206 @@
+import { Link } from "react-router-dom";
+
+export function Home() {
+ return (
+
+ {/* Nav */}
+
+
+ {/* Hero */}
+
+
+
+ Skip the fluff.
+
+ Get the wisdom.
+
+
+ AI-powered podcast insights. We extract the 20% that matters and deliver it in bite-sized clips you can actually use.
+
+
+
+
+ See How It Works
+
+
+
+
+
+ {/* Problem/Solution */}
+
+
+
+
+
The Problem
+
Podcasts are long. Your time isn't.
+
+ The average podcast is 90 minutes. The valuable insights? Maybe 15 minutes scattered throughout. You don't have time to hunt for gold in every episode.
+
+
+
+
The Solution
+
AI that listens so you don't have to.
+
+ PodVibe extracts the key insights, timestamps them, and delivers them as clips. Ask a question, get the answer. Like having a research assistant for every podcast.
+
+
+
+
+
+
+ {/* How It Works */}
+
+
+
+
How it works
+
Three steps to podcast enlightenment
+
+
+
+
1
+
Upload or paste
+
Drop in a podcast episode URL or upload an audio file. We handle the rest.
+
+
+
2
+
AI extracts insights
+
Our AI identifies key moments, ranks them by value, and maps them to precise timestamps.
+
+
+
3
+
Listen to clips
+
Get TikTok-length wisdom drops. Search, browse, or let our guide recommend what's next.
+
+
+
+
+
+ {/* Example Insight Card */}
+
+
+
+
What an insight looks like
+
Structured knowledge, ready to use
+
+
+
+
+
+
12:34 - 13:21 · The Tim Ferriss Show
+
The best investments are often the ones you say no to.
+
+
+
+ "I've found that 90% of my returns came from 10% of my investments. But more importantly, 90% of my peace of mind came from the 100 deals I turned down."
+
+
+ investing
+ tactical
+ contrarian
+
+
+
+ 0.9 actionable
+
+
+ 0.8 novel
+
+
+ 0.95 clipable
+
+
+
+
+
+
+ {/* Features */}
+
+
+
+
Built for how you learn
+
Features that make podcast wisdom accessible
+
+
+
+
+
+
+
Voice search
+
"What did they say about raising a Series A?" Ask naturally, get clips.
+
+
+
+
+
+
Smart boundaries
+
Clips start and end at natural breaks. No awkward cuts or missing context.
+
+
+
+
+
+
Podcast DJ
+
After each clip, your guide recommends what to explore next based on your interests.
+
+
+
+
+
+ {/* CTA */}
+
+
+
Ready to skip the fluff?
+
Join the waitlist for early access. We're launching soon.
+
+
+
+
+
No spam. We'll only email you when we launch.
+
+
+
+ {/* Footer */}
+
+
+ );
+}
diff --git a/src/pages/Player.tsx b/src/pages/Player.tsx
new file mode 100644
index 000000000..21f14c07f
--- /dev/null
+++ b/src/pages/Player.tsx
@@ -0,0 +1,116 @@
+import { useState } from "react";
+import { useParams, Link } from "react-router-dom";
+import { categories } from "../data/categories";
+import { MediaPlayer } from "../components/MediaPlayer";
+
+export function Player() {
+ const { categoryId } = useParams<{ categoryId: string }>();
+ const [currentClipIndex, setCurrentClipIndex] = useState(0);
+
+ const category = categories.find((c) => c.id === categoryId);
+
+ if (!category) {
+ return (
+
+ );
+}
diff --git a/src/planner.py b/src/planner.py
new file mode 100644
index 000000000..614575159
--- /dev/null
+++ b/src/planner.py
@@ -0,0 +1,159 @@
+"""
+Planner Module - Breaks down user goals into sub-tasks
+Part of the PodVibe.fm Agentic AI System
+"""
+
from typing import Any, Dict, List, Optional
from datetime import datetime
+
+
class Planner:
    """
    Planner component that breaks down the summarization task into discrete sub-tasks.
    Follows the ReAct (Reasoning + Acting) pattern.
    """

    def __init__(self):
        """Initialize the planner with task templates."""
        # Ordered sub-task templates; each task's 'status' moves through
        # 'pending' -> 'in_progress' -> 'completed' (or 'failed').
        self.task_templates = {
            'summarize': [
                {
                    'step': 1,
                    'action': 'extract_video_id',
                    'description': 'Parse YouTube URL and extract video ID',
                    'tool': 'url_parser',
                    'status': 'pending'
                },
                {
                    'step': 2,
                    'action': 'fetch_transcript',
                    'description': 'Retrieve video transcript from YouTube',
                    'tool': 'youtube_api',
                    'status': 'pending'
                },
                {
                    'step': 3,
                    'action': 'generate_summary',
                    'description': 'Generate AI summary using Gemini',
                    'tool': 'gemini_api',
                    'status': 'pending'
                },
                {
                    'step': 4,
                    'action': 'extract_keywords',
                    'description': 'Extract 10 semantic keywords from summary',
                    'tool': 'keyword_extractor',
                    'status': 'pending'
                },
                {
                    'step': 5,
                    'action': 'store_result',
                    'description': 'Store summary and metadata',
                    'tool': 'memory_store',
                    'status': 'pending'
                }
            ]
        }

    def create_plan(self, user_input: Dict) -> List[Dict]:
        """
        Create an execution plan based on user input.

        Args:
            user_input: Dictionary containing:
                - url: YouTube video URL
                - summary_type: Type of summary requested
                  (defaults to 'comprehensive')

        Returns:
            List of sub-tasks to execute, each annotated with a 'context' dict.
        """
        # Copy each task dict so later status updates never mutate the shared
        # templates (a top-level list.copy() alone would still share the dicts).
        plan_with_context = []
        for task in self.task_templates['summarize']:
            task_copy = task.copy()
            task_copy['context'] = {
                'url': user_input.get('url'),
                'summary_type': user_input.get('summary_type', 'comprehensive'),
                'timestamp': datetime.now().isoformat()
            }
            plan_with_context.append(task_copy)

        return plan_with_context

    def update_task_status(self, plan: List[Dict], step: int, status: str, result: Any = None) -> List[Dict]:
        """
        Update the status of a specific task in the plan (mutates in place).

        Args:
            plan: Current execution plan
            step: Step number to update
            status: New status ('pending', 'in_progress', 'completed', 'failed')
            result: Optional result data from task execution

        Returns:
            The same plan list, with the matching task updated.
        """
        for task in plan:
            if task['step'] == step:
                task['status'] = status
                task['updated_at'] = datetime.now().isoformat()
                if result is not None:
                    task['result'] = result
                break

        return plan

    def get_next_task(self, plan: List[Dict]) -> Optional[Dict]:
        """
        Get the next pending task from the plan.

        Args:
            plan: Current execution plan

        Returns:
            The first task whose status is 'pending', or None when none remain.
        """
        for task in plan:
            if task['status'] == 'pending':
                return task
        return None

    def is_plan_complete(self, plan: List[Dict]) -> bool:
        """
        Check if all tasks in the plan are completed.

        Args:
            plan: Current execution plan

        Returns:
            True if all tasks completed (vacuously True for an empty plan).
        """
        return all(task['status'] == 'completed' for task in plan)

    def get_plan_summary(self, plan: List[Dict]) -> Dict:
        """
        Generate a summary of the current plan state.

        Args:
            plan: Current execution plan

        Returns:
            Dictionary with per-status counts and a completion percentage.
        """
        total_tasks = len(plan)
        completed = sum(1 for task in plan if task['status'] == 'completed')
        failed = sum(1 for task in plan if task['status'] == 'failed')
        pending = sum(1 for task in plan if task['status'] == 'pending')
        in_progress = sum(1 for task in plan if task['status'] == 'in_progress')

        # Guard against ZeroDivisionError when the plan is empty.
        rate = (completed / total_tasks) * 100 if total_tasks else 0.0

        return {
            'total_tasks': total_tasks,
            'completed': completed,
            'failed': failed,
            'pending': pending,
            'in_progress': in_progress,
            'completion_rate': f"{rate:.1f}%"
        }
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 000000000..2e0e1ba5f
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,6 @@
+google-generativeai>=0.3.0
+youtube-transcript-api>=0.6.0
+python-dotenv>=1.0.0
+streamlit>=1.12.0
+flask>=3.0.0
+flask-cors>=4.0.0
diff --git a/src/run.md b/src/run.md
new file mode 100644
index 000000000..ecbbf109a
--- /dev/null
+++ b/src/run.md
@@ -0,0 +1,17 @@
+Run streamlit:
+```bash
+cd src
+streamlit run app.py
+```
+
+Run the Flask API backend (serves data to the React frontend):
+```bash
+cd src
+python api.py
+```
+
+Run the React frontend:
+
+```bash
+cd frontend
+npm install
+npm run dev
+```
diff --git a/src/summary_aR20FWCCjAs.json b/src/summary_aR20FWCCjAs.json
new file mode 100644
index 000000000..a3f585a9c
--- /dev/null
+++ b/src/summary_aR20FWCCjAs.json
@@ -0,0 +1,16 @@
+{
+ "video_id": "aR20FWCCjAs",
+ "url": "https://www.youtube.com/watch?v=aR20FWCCjAs",
+ "transcript": "You know what's crazy? That all of this is real.\nMeaning what?Β Don't you think so? All this AI stuff andΒ \nall this Bay Areaβ¦ that it's happening.Β Isn't it straight out of science fiction?\nAnother thing that's crazy is howΒ Β normal the slow takeoff feels.\nThe idea that we'd be investing 1%Β Β of GDP in AI, I feel like it would have felt likeΒ \na bigger deal, whereas right now it just feels...Β We get used to things pretty fast, it turns out.\nBut also it's kind of abstract. What does itΒ Β mean? It means that you see it in the news,Β \nthat such and such company announced suchΒ Β and such dollar amount. That's all you see.Β \nIt's not really felt in any other way so far.Β Should we actually begin here? I thinkΒ \nthis is an interesting discussion.Β Sure.\nI think your point,Β Β about how from the average person's point of viewΒ \nnothing is that different, will continue beingΒ Β true even into the singularity.\nNo, I don't think so.Β Okay, interesting.\nThe thing which I was referringΒ Β to not feeling different is, okay, such and suchΒ \ncompany announced some difficult-to-comprehendΒ Β dollar amount of investment.\nI don't think anyone knows what to do with that.Β But I think the impact of AI is going to be felt.\nAI is going to be diffused through the economy.Β There'll be very strong economic forcesΒ \nfor this, and I think the impact isΒ Β going to be felt very strongly.\nWhen do you expect that impact?Β I think the models seem smarter thanΒ \ntheir economic impact would imply.Β Yeah. This is one of the very confusingΒ \nthings about the models right now.Β How to reconcile the fact thatΒ \nthey are doing so well on evals?Β You look at the evals and you go, \"ThoseΒ \nare pretty hard evals.\" They are doingΒ Β so well. 
But the economic impactΒ \nseems to be dramatically behind.Β It's very difficult to make sense of,Β \nhow can the model, on the one hand,Β Β do these amazing things, and then on the otherΒ \nhand, repeat itself twice in some situation?Β An example would be, let's say youΒ \nuse vibe coding to do something.Β You go to some place and then you get a bug.\nThen you tell the model,Β Β \"Can you please fix the bug?\"\nAnd the model says, \"Oh my God,Β Β you're so right. I have a bug. Let me goΒ \nfix that.\" And it introduces a second bug.Β Then you tell it, \"You have thisΒ \nnew second bug,\" and it tells you,Β Β \"Oh my God, how could I have done it?\nYou're so right again,\" and brings backΒ Β the first bug, and you can alternate betweenΒ \nthose. How is that possible? I'm not sure, but itΒ Β does suggest that something strange is going on. IΒ \nhave two possible explanations. The more whimsicalΒ Β explanation is that maybe RL training makes theΒ \nmodels a little too single-minded and narrowlyΒ Β focused, a little bit too unaware, even thoughΒ \nit also makes them aware in some other ways.Β Because of this, they can't do basic things.Β \nBut there is another explanation. 
Back whenΒ Β people were doing pre-training, theΒ \nquestion of what data to train on wasΒ Β answered, because that answer was everything.\nWhen you do pre-training, you need all the data.Β So you don't have to think if it'sΒ \ngoing to be this data or that data.Β But when people do RL training,Β \nthey do need to think.Β They say, \"Okay, we want to have thisΒ \nkind of RL training for this thingΒ Β and that kind of RL training for that thing.\"\nFrom what I hear, all the companies have teamsΒ Β that just produce new RL environmentsΒ \nand just add it to the training mix.Β The question is, well, what are those?\nThere are so many degrees of freedom.Β There is such a huge variety ofΒ \nRL environments you could produce.Β One thing you could do, and I think thisΒ \nis something that is done inadvertently,Β Β is that people take inspiration from the evals.\nYou say, \"Hey, I would love our model to doΒ Β really well when we release it.\nI want the evals to look great.Β What would be RL trainingΒ \nthat could help on this task?\"Β I think that is something that happens, andΒ \nit could explain a lot of what's going on.Β If you combine this with generalizationΒ \nof the models actually being inadequate,Β Β that has the potential to explain a lotΒ \nof what we are seeing, this disconnectΒ Β between eval performance and actual real-worldΒ \nperformance, which is something that we don'tΒ Β today even understand, what we mean by that.\nI like this idea that the real reward hackingΒ Β is the human researchers whoΒ \nare too focused on the evals.Β I think there are two ways toΒ \nunderstand, or to try to think about,Β Β what you have just pointed out.\nOne is that if it's the case thatΒ Β simply by becoming superhuman at a codingΒ \ncompetition, a model will not automaticallyΒ Β become more tasteful and exercise better judgmentΒ \nabout how to improve your codebase, well then youΒ Β should expand the suite of environments suchΒ \nthat you're not just testing it on havingΒ Β 
the best performance in coding competition.\nIt should also be able to make the best kindΒ Β of application for X thing or Y thing or Z thing.\nAnother, maybe this is what you're hinting at,Β Β is to say, \"Why should it be the case inΒ \nthe first place that becoming superhumanΒ Β at coding competitions doesn't make you aΒ \nmore tasteful programmer more generally?\"Β Maybe the thing to do is not to keepΒ \nstacking up the amount and diversityΒ Β of environments, but to figure out an approachΒ \nwhich lets you learn from one environment andΒ Β improve your performance on something else.\nI have a human analogy which might be helpful.Β Let's take the case of competitive programming,Β \nsince you mentioned that. Suppose you have twoΒ Β students. One of them decided they wantΒ \nto be the best competitive programmer, soΒ Β they will practice 10,000 hours for that domain.\nThey will solve all the problems, memorize all theΒ Β proof techniques, and be very skilled at quicklyΒ \nand correctly implementing all the algorithms.Β By doing so, they became one of the best.\nStudent number two thought, \"Oh,Β Β competitive programming is cool.\"\nMaybe they practiced for 100 hours,Β Β much less, and they also did really well.\nWhich one do you think is going to do betterΒ Β in their career later on?\nThe second.Β Right. 
I think that's basically what's going on.\nThe models are much more like theΒ Β first student, but even more.\nBecause then we say, the model shouldΒ Β be good at competitive programming so let's getΒ \nevery single competitive programming problem ever.Β And then let's do some data augmentationΒ \nso we have even more competitiveΒ Β programming problems, and we train on that.\nNow you've got this great competitive programmer.Β With this analogy, I think it's more intuitive.\nYeah, okay, if it's so well trained, all theΒ Β different algorithms and all the differentΒ \nproof techniques are right at its fingertips.Β And it's more intuitive that with thisΒ \nlevel of preparation, it would notΒ Β necessarily generalize to other things.\nBut then what is the analogy for whatΒ Β the second student is doing beforeΒ \nthey do the 100 hours of fine-tuning?Β I think they have \"it.\" The \"it\"Β \nfactor. When I was an undergrad,Β Β I remember there was a student like thisΒ \nthat studied with me, so I know it exists.Β I think it's interesting to distinguishΒ \n\"it\" from whatever pre-training does.Β One way to understand what you just saidΒ \nabout not having to choose the data inΒ Β pre-training is to say it's actually notΒ \ndissimilar to the 10,000 hours of practice.Β It's just that you get that 10,000 hoursΒ \nof practice for free because it's alreadyΒ Β somewhere in the pre-training distribution.\nBut maybe you're suggesting there's actuallyΒ Β not that much generalization from pre-training.\nThere's just so much data in pre-training, butΒ Β it's not necessarily generalizing better than RL.\nThe main strength of pre-training isΒ Β that: A, there is so much of it, and B,Β \nyou don't have to think hard about whatΒ Β data to put into pre-training.\nIt's very natural data, and itΒ Β does include in it a lot of what people do:Β \npeople's thoughts and a lot of the features.Β It's like the whole world as projected byΒ \npeople onto text, and pre-training triesΒ Β to capture 
that using a huge amount of data.\nPre-training is very difficult to reason aboutΒ Β because it's so hard to understand the mannerΒ \nin which the model relies on pre-training data.Β Whenever the model makes a mistake, could it beΒ \nbecause something by chance is not as supportedΒ Β by the pre-training data? \"Support byΒ \npre-training\" is maybe a loose term.Β I don't know if I can addΒ \nanything more useful on this.Β I don't think there is aΒ \nhuman analog to pre-training.Β Here are analogies that people have proposedΒ \nfor what the human analogy to pre-training is.Β I'm curious to get your thoughtsΒ \non why they're potentially wrong.Β One is to think about the first 18, or 15,Β \nor 13 years of a person's life when theyΒ Β aren't necessarily economically productive,Β \nbut they are doing something that is makingΒ Β them understand the world better and so forth.\nThe other is to think about evolution as doingΒ Β some kind of search for 3 billion years, whichΒ \nthen results in a human lifetime instance.Β I'm curious if you think either ofΒ \nthese are analogous to pre-training.Β How would you think about what lifetimeΒ \nhuman learning is like, if not pre-training?Β I think there are some similarities between bothΒ \nof these and pre-training, and pre-training triesΒ Β to play the role of both of these.\nBut I think there are someΒ Β big differences as well.\nThe amount of pre-training data is very,Β Β very staggering.\nYes.Β Somehow a human being, after even 15 yearsΒ \nwith a tiny fraction of the pre-trainingΒ Β data, they know much less.\nBut whatever they do know,Β Β they know much more deeply somehow.\nAlready at that age, you would notΒ Β make mistakes that our AIs make. There is anotherΒ \nthing. You might say, could it be something likeΒ Β evolution? The answer is maybe. 
But in this case,Β \nI think evolution might actually have an edge.Β I remember reading about this case.\nOne way in which neuroscientists canΒ Β learn about the brain is by studying people withΒ \nbrain damage to different parts of the brain.Β Some people have the most strange symptomsΒ \nyou could imagine. It's actually really,Β Β really interesting. One case thatΒ \ncomes to mind that's relevant.Β I read about this person who had some kindΒ \nof brain damage, a stroke or an accident,Β Β that took out his emotional processing.\nSo he stopped feeling any emotion.Β He still remained very articulateΒ \nand he could solve little puzzles,Β Β and on tests he seemed to be just fine.Β \nBut he felt no emotion. He didn't feel sad,Β Β he didn't feel anger, he didn't feel animated.\nHe became somehow extremely bad at making anyΒ Β decisions at all.\nIt would take himΒ Β hours to decide on which socks to wear.\nHe would make very bad financial decisions.Β What does it say about the role of our built-inΒ \nemotions in making us a viable agent, essentially?Β To connect to your question about pre-training,Β \nmaybe if you are good enough at getting everythingΒ Β out of pre-training, you could get that as well.\nBut that's the kind of thing which seems...Β Well, it may or may not be possibleΒ \nto get that from pre-training.Β What is \"that\"? Clearly not just directlyΒ \nemotion. It seems like some almost valueΒ Β function-like thing which is telling you whatΒ \nthe end reward for any decision should be.Β You think that doesn't sort ofΒ \nimplicitly come from pre-training?Β I think it could. I'm justΒ \nsaying it's not 100% obvious.Β But what is that? 
How do you think about emotions?\nWhat is the ML analogy for emotions?Β It should be some kind of a value function thing.\nBut I donβt think there is a great ML analogyΒ Β because right now, value functions don't playΒ \na very prominent role in the things people do.Β It might be worth defining for the audience whatΒ \na value function is, if you want to do that.Β Certainly, I'll be very happy to do that.\nWhen people do reinforcement learning,Β Β the way reinforcement learning is doneΒ \nright now, how do people train those agents?Β You have your neural net and youΒ \ngive it a problem, and then youΒ Β tell the model, \"Go solve it.\"\nThe model takes maybe thousands,Β Β hundreds of thousands of actions or thoughts orΒ \nsomething, and then it produces a solution. TheΒ Β solution is graded. And then the scoreΒ \nis used to provide a training signalΒ Β for every single action in your trajectory.\nThat means that if you are doing somethingΒ Β that goes for a long timeβif you're trainingΒ \na task that takes a long time to solveβitΒ Β will do no learning at all until youΒ \ncome up with the proposed solution.Β That's how reinforcement learning is done naively.\nThat's how o1, R1 ostensibly are done.Β The value function says something like,Β \n\"Maybe I could sometimes, not always,Β Β tell you if you are doing well or badly.\"\nThe notion of a value function is moreΒ Β useful in some domains than others.\nFor example, when you play chess andΒ Β you lose a piece, I messed up.\nYou don't need to play the wholeΒ Β game to know that what I just did was bad, andΒ \ntherefore whatever preceded it was also bad.Β The value function lets you short-circuitΒ \nthe wait until the very end.Β Let's suppose that you are doing some kindΒ \nof a math thing or a programming thing,Β Β and you're trying to explore aΒ \nparticular solution or direction.Β After, let's say, a thousand steps of thinking,Β \nyou concluded that this direction is unpromising.Β As soon as you conclude this, youΒ 
\ncould already get a reward signalΒ Β a thousand timesteps previously, whenΒ \nyou decided to pursue down this path.Β You say, \"Next time I shouldn't pursue thisΒ \npath in a similar situation,\" long before youΒ Β actually came up with the proposed solution.\nThis was in the DeepSeek R1 paperβ that theΒ Β space of trajectories is so wide thatΒ \nmaybe it's hard to learn a mappingΒ Β from an intermediate trajectory and value.\nAnd also given that, in coding for exampleΒ Β you'll have the wrong idea, then you'llΒ \ngo back, then you'll change something.Β This sounds like such lackΒ \nof faith in deep learning.Β Sure it might be difficult, butΒ \nnothing deep learning can't do.Β My expectation is that a value function shouldΒ \nbe useful, and I fully expect that they willΒ Β be used in the future, if not already.\nWhat I was alluding to with the personΒ Β whose emotional center got damaged, itβs moreΒ \nthat maybe what it suggests is that the valueΒ Β function of humans is modulated by emotions inΒ \nsome important way that's hardcoded by evolution.Β And maybe that is important forΒ \npeople to be effective in the world.Β That's the thing I was planning on asking you.Β There's something really interesting aboutΒ \nemotions of the value function, which is thatΒ Β it's impressive that they have this much utilityΒ \nwhile still being rather simple to understand.Β I have two responses. 
I do agree that compared toΒ \nthe kind of things that we learn and the thingsΒ Β we are talking about, the kind of AI we areΒ \ntalking about, emotions are relatively simple.Β They might even be so simple that maybe youΒ \ncould map them out in a human-understandable way.Β I think it would be cool to do.\nIn terms of utility though,Β Β I think there is a thing where there is thisΒ \ncomplexity-robustness tradeoff, where complexΒ Β things can be very useful, but simple things areΒ \nvery useful in a very broad range of situations.Β One way to interpret what we are seeing is thatΒ \nwe've got these emotions that evolved mostlyΒ Β from our mammal ancestors and then fine-tuned aΒ \nlittle bit while we were hominids, just a bit.Β We do have a decent amount of social emotionsΒ \nthough which mammals may lack. But they'reΒ Β not very sophisticated. And because they'reΒ \nnot sophisticated, they serve us so well inΒ Β this very different world compared to theΒ \none that we've been living in. Actually,Β Β they also make mistakes. For example, ourΒ \nemotionsβ¦ Well actually, I donβt know.Β Does hunger count as an emotion? It's debatable.Β \nBut I think, for example, our intuitive feelingΒ Β of hunger is not succeeding in guiding usΒ \ncorrectly in this world with an abundance of food.Β People have been talking about scalingΒ \ndata, scaling parameters, scaling compute.Β Is there a more generalΒ \nway to think about scaling?Β What are the other scaling axes?\nHere's a perspective that I think might be true.Β The way ML used to work is thatΒ \npeople would just tinker withΒ Β stuff and try to get interesting results.\nThat's what's been going on in the past. ThenΒ Β the scaling insight arrived. Scaling laws, GPT-3,Β \nand suddenly everyone realized we should scale.Β This is an example of how languageΒ \naffects thought. 
\"Scaling\" is justΒ Β one word, but it's such a powerful wordΒ \nbecause it informs people what to do.Β They say, \"Let's try to scale things.\"\nSo you say, what are we scaling?Β Pre-training was the thing to scale.\nIt was a particular scaling recipe.Β The big breakthrough of pre-training isΒ \nthe realization that this recipe is good.Β You say, \"Hey, if you mix some computeΒ \nwith some data into a neural net ofΒ Β a certain size, you will get results.\nYou will know that you'll be better if youΒ Β just scale the recipe up.\" This is also great.Β \nCompanies love this because it gives you a veryΒ Β low-risk way of investing your resources.\nIt's much harder to invest your resourcesΒ Β in research. Compare that. If you research,Β \nyou need to be like, \"Go forth researchersΒ Β and research and come up with something\",Β \nversus get more data, get more compute.Β You know you'll get something from pre-training.\nIndeed, it looks like, based on variousΒ Β things some people say on Twitter, maybe itΒ \nappears that Gemini have found a way to getΒ Β more out of pre-training.\nAt some point though,Β Β pre-training will run out of data.\nThe data is very clearly finite. WhatΒ Β do you do next? Either you do some kindΒ \nof souped-up pre-training, a differentΒ Β recipe from the one you've done before, orΒ \nyou're doing RL, or maybe something else.Β But now that compute is big, computeΒ \nis now very big, in some sense weΒ Β are back to the age of research.\nMaybe here's another way to put it.Β Up until 2020, from 2012 toΒ \n2020, it was the age of research.Β Now, from 2020 to 2025, it was theΒ \nage of scalingβmaybe plus or minus,Β Β let's add error bars to those yearsβbecauseΒ \npeople say, \"This is amazing. You've got toΒ Β scale more. Keep scaling.\" The one word:Β \nscaling. 
But now the scale is so big.Β Is the belief really, \"Oh, it's so big, but if youΒ \nhad 100x more, everything would be so different?\"Β It would be different, for sure.\nBut is the belief that if you justΒ Β 100x the scale, everything would be transformed?Β \nI don't think that's true. So it's back to the ageΒ Β of research again, just with big computers.\nThat's a very interesting way to put it.Β But let me ask you theΒ \nquestion you just posed then.Β What are we scaling, and whatΒ \nwould it mean to have a recipe?Β I guess I'm not aware of a very cleanΒ \nrelationship that almost looks like a lawΒ Β of physics which existed in pre-training.\nThere was a power law between data orΒ Β compute or parameters and loss.\nWhat is the kind of relationshipΒ Β we should be seeking, and how should we thinkΒ \nabout what this new recipe might look like?Β We've already witnessed a transition from oneΒ \ntype of scaling to a different type of scaling,Β Β from pre-training to RL. Now people are scalingΒ \nRL. 
Now based on what people say on Twitter,Β Β they spend more compute on RL than onΒ \npre-training at this point, because RLΒ Β can actually consume quite a bit of compute.\nYou do very long rollouts, so it takes a lotΒ Β of compute to produce those rollouts.\nThen you get a relatively small amountΒ Β of learning per rollout, so youΒ \nreally can spend a lot of compute.Β I wouldn't even call it scaling.\nI would say, \"Hey, what are you doing?Β Is the thing you are doing the mostΒ \nproductive thing you could be doing?Β Can you find a more productiveΒ \nway of using your compute?\"Β We've discussed the valueΒ \nfunction business earlier.Β Maybe once people get good at valueΒ \nfunctions, they will be using theirΒ Β resources more productively.\nIf you find a whole other wayΒ Β of training models, you could say, \"Is thisΒ \nscaling or is it just using your resources?\"Β I think it becomes a little bit ambiguous.\nIn the sense that, when people were in theΒ Β age of research back then, it was,Β \n\"Let's try this and this and this.Β Let's try that and that and that.\nOh, look, something interesting is happening.\"Β I think there will be a return to that.\nIf we're back in the era of research,Β Β stepping back, what is the part of theΒ \nrecipe that we need to think most about?Β When you say value function, peopleΒ \nare already trying the current recipe,Β Β but then having LLM-as-a-Judge and so forth.\nYou could say that's a value function,Β Β but it sounds like you have somethingΒ \nmuch more fundamental in mind.Β Should we even rethink pre-training at all and notΒ \njust add more steps to the end of that process?Β The discussion about value function,Β \nI think it was interesting.Β I want to emphasize that I think the valueΒ \nfunction is something that's going to make RL moreΒ Β efficient, and I think that makes a difference.\nBut I think anything you can do with a valueΒ Β function, you can do without, just more slowly.\nThe thing which I think is the most 
fundamentalΒ Β is that these models somehow just generalizeΒ \ndramatically worse than people. It's superΒ Β obvious. That seems like a very fundamental thing.\nSo this is the crux: generalization. There are twoΒ Β sub-questions. There's one which is about sampleΒ \nefficiency: why should it take so much more dataΒ Β for these models to learn than humans? There'sΒ \na second question. Even separate from the amountΒ Β of data it takes, why is it so hard to teachΒ \nthe thing we want to a model than to a human?Β For a human, we don't necessarily need aΒ \nverifiable reward to be able toβ¦ You're probablyΒ Β mentoring a bunch of researchers right now, andΒ \nyou're talking with them, you're showing them yourΒ Β code, and you're showing them how you think.\nFrom that, they're picking up your way ofΒ Β thinking and how they should do research.\nYou donβt have to set a verifiable reward forΒ Β them that's like, \"Okay, this is the next part ofΒ \nthe curriculum, and now this is the next part ofΒ Β your curriculum. 
Oh, this training was unstable.\"Β \nThere's not this schleppy, bespoke process.Β Perhaps these two issues are actuallyΒ \nrelated in some way, but I'd be curiousΒ Β to explore this second thing, which is moreΒ \nlike continual learning, and this first thing,Β Β which feels just like sample efficiency.\nYou could actually wonder that one possibleΒ Β explanation for the human sample efficiencyΒ \nthat needs to be considered is evolution.Β Evolution has given us a small amountΒ \nof the most useful information possible.Β For things like vision, hearing, andΒ \nlocomotion, I think there's a prettyΒ Β strong case that evolution has given us a lot.\nFor example, human dexterity far exceedsβ¦ I meanΒ Β robots can become dexterous too if you subjectΒ \nthem to a huge amount of training in simulation.Β But to train a robot in the real worldΒ \nto quickly pick up a new skill likeΒ Β a person does seems very out of reach.\nHere you could say, \"Oh yeah, locomotion.Β All our ancestors neededΒ \ngreat locomotion, squirrels.Β So with locomotion, maybe we'veΒ \ngot some unbelievable prior.\"Β You could make the same case for vision.\nI believe Yann LeCun made the point thatΒ Β children learn to drive after 10Β \nhours of practice, which is true.Β But our vision is so good.\nAt least for me,Β Β I remember myself being a five-year-old.\nI was very excited about cars back then.Β I'm pretty sure my car recognition was more thanΒ \nadequate for driving already as a five-year-old.Β You don't get to see thatΒ \nmuch data as a five-year-old.Β You spend most of your time in your parents'Β \nhouse, so you have very low data diversity.Β But you could say maybe that's evolution too.\nBut in language and math and coding, probably not.Β It still seems better than models.\nObviously, models are better than the averageΒ Β human at language, math, and coding.\nBut are they better thanΒ Β the average human at learning?\nOh yeah. Oh yeah, absolutely. 
What I meantΒ Β to say is that language, math, and codingβandΒ \nespecially math and codingβsuggests that whateverΒ Β it is that makes people good at learning isΒ \nprobably not so much a complicated prior,Β Β but something more, some fundamental thing.\nI'm not sure I understood. WhyΒ Β should that be the case?\nSo consider a skill in whichΒ Β people exhibit some kind of great reliability.\nIf the skill is one that was very useful to ourΒ Β ancestors for many millions of years, hundredsΒ \nof millions of years, you could argue that maybeΒ Β humans are good at it because of evolution,Β \nbecause we have a prior, an evolutionary priorΒ Β that's encoded in some very non-obviousΒ \nway that somehow makes us so good at it.Β But if people exhibit great ability, reliability,Β \nrobustness, and ability to learn in a domain thatΒ Β really did not exist until recently, thenΒ \nthis is more an indication that peopleΒ Β might have just better machine learning, period.\nHow should we think about what that is? What isΒ Β the ML analogy? There are a couple of interestingΒ \nthings about it. It takes fewer samples. It'sΒ Β more unsupervised. A child learning to drive aΒ \ncarβ¦ Children are not learning to drive a car.Β A teenager learning how to drive a car is notΒ \nexactly getting some prebuilt, verifiable reward.Β It comes from their interaction withΒ \nthe machine and with the environment.Β Β It takes much fewer samples. It seemsΒ \nmore unsupervised. It seems more robust?Β Much more robust. 
The robustnessΒ \nof people is really staggering.Β Do you have a unified way of thinking aboutΒ \nwhy all these things are happening at once?Β What is the ML analogy that couldΒ \nrealize something like this?Β One of the things that you've been asking about isΒ \nhow can the teenage driver self-correct and learnΒ Β from their experience without an external teacher?\nThe answer is that they have their value function.Β They have a general sense which is also,Β \nby the way, extremely robust in people.Β Whatever the human value function is,Β \nwith a few exceptions around addiction,Β Β it's actually very, very robust.\nSo for something like a teenagerΒ Β that's learning to drive, they start to drive, andΒ \nthey already have a sense of how they're drivingΒ Β immediately, how badly they are, how unconfident.Β \nAnd then they see, \"Okay.\" And then, of course,Β Β the learning speed of any teenager is so fast.\nAfter 10 hours, you're good to go.Β It seems like humans have someΒ \nsolution, but I'm curious aboutΒ Β how they are doing it and why is it so hard?\nHow do we need to reconceptualize the wayΒ Β we're training models to makeΒ \nsomething like this possible?Β That is a great question to ask, and it'sΒ \na question I have a lot of opinions about.Β But unfortunately, we live in a world whereΒ \nnot all machine learning ideas are discussedΒ Β freely, and this is one of them.\nThere's probably a way to do it.Β I think it can be done.\nThe fact that people are like that,Β Β I think it's a proof that it can be done.\nThere may be another blocker though,Β Β which is that there is a possibility that theΒ \nhuman neurons do more compute than we think.Β If that is true, and if that plays an importantΒ \nrole, then things might be more difficult.Β But regardless, I do think it points toΒ \nthe existence of some machine learningΒ Β principle that I have opinions on.\nBut unfortunately, circumstancesΒ Β make it hard to discuss in detail.\nNobody listens to this podcast, 
Ilya.Β I'm curious. If you say we are back in an eraΒ \nof research, you were there from 2012 to 2020.Β What is the vibe now going to be ifΒ \nwe go back to the era of research?Β For example, even after AlexNet, theΒ \namount of compute that was used toΒ Β run experiments kept increasing, and theΒ \nsize of frontier systems kept increasing.Β Do you think now that this era of research willΒ \nstill require tremendous amounts of compute?Β Do you think it will require going backΒ \ninto the archives and reading old papers?Β You were at Google and OpenAI and Stanford, theseΒ \nplaces, when there was more of a vibe of research?Β What kind of things should weΒ \nbe expecting in the community?Β One consequence of the age of scaling is thatΒ \nscaling sucked out all the air in the room.Β Because scaling sucked out all the air in theΒ \nroom, everyone started to do the same thing.Β We got to the point where we areΒ \nin a world where there are moreΒ Β companies than ideas by quite a bit.\nActually on that, there is this SiliconΒ Β Valley saying that says that ideasΒ \nare cheap, execution is everything.Β People say that a lot, and there is truth to that.\nBut then I saw someone say on TwitterΒ Β something like, \"If ideas are so cheap,Β \nhow come no one's having any ideas?\"Β And I think it's true too.\nIf you think about research progress in termsΒ Β of bottlenecks, there are several bottlenecks.\nOne of them is ideas, and one of them is yourΒ Β ability to bring them to life, whichΒ \nmight be compute but also engineering.Β If you go back to the '90s, let's say,Β \nyou had people who had pretty good ideas,Β Β and if they had much larger computers, maybe theyΒ \ncould demonstrate that their ideas were viable.Β But they could not, so they could onlyΒ \nhave a very, very small demonstrationΒ Β that did not convince anyone. So theΒ \nbottleneck was compute. 
Then in theΒ Β age of scaling, compute has increased a lot.\nOf course, there is a question of how muchΒ Β compute is needed, but compute is large.\nCompute is large enough such that it's notΒ Β obvious that you need that much moreΒ \ncompute to prove some idea. I'll giveΒ Β you an analogy. AlexNet was built on two GPUs.\nThat was the total amount of compute used for it.Β The transformer was built on 8 to 64 GPUs.\nNo single transformer paper experiment usedΒ Β more than 64 GPUs of 2017, which would beΒ \nlike, what, two GPUs of today? The ResNet,Β Β right? You could argue that the o1 reasoning wasΒ \nnot the most compute-heavy thing in the world.Β So for research, you definitely needΒ \nsome amount of compute, but it's farΒ Β from obvious that you need the absolutelyΒ \nlargest amount of compute ever for research.Β You might argue, and I think it is true, thatΒ \nif you want to build the absolutely best systemΒ Β then it helps to have much more compute.\nEspecially if everyone is within the sameΒ Β paradigm, then compute becomesΒ \none of the big differentiators.Β I'm asking you for the history,Β \nbecause you were actually there.Β I'm not sure what actually happened.\nIt sounds like it was possible to developΒ Β these ideas using minimal amounts of compute.\nBut the transformer didn'tΒ Β immediately become famous.\nIt became the thing everybody startedΒ Β doing and then started experimenting on top ofΒ \nand building on top of because it was validatedΒ Β at higher and higher levels of compute.\nCorrect.Β And if you at SSI have 50 different ideas, howΒ \nwill you know which one is the next transformerΒ Β and which one is brittle, without having theΒ \nkinds of compute that other frontier labs have?Β I can comment on that. The shortΒ \ncomment is that you mentioned SSI.Β Specifically for us, the amount of computeΒ \nthat SSI has for research is really not thatΒ Β small. I want to explain why. 
Simple mathΒ \ncan explain why the amount of compute thatΒ Β we have is comparable for research than one mightΒ \nthink. I'll explain. SSI has raised $3 billion,Β Β which is a lot by any absolute sense.\nBut you could say, \"Look at theΒ Β other companies raising much more.\"\nBut a lot of their compute goes for inference.Β These big numbers, these big loans, it'sΒ \nearmarked for inference. That's number one.Β Β Number two, if you want to have a productΒ \non which you do inference, you need toΒ Β have a big staff of engineers, salespeople.\nA lot of the research needs to be dedicated toΒ Β producing all kinds of product-related features.\nSo then when you look at what's actually left forΒ Β research, the difference becomes a lot smaller.\nThe other thing is, if you are doing somethingΒ Β different, do you really need theΒ \nabsolute maximal scale to prove it?Β I don't think that's true at all.\nI think that in our case, we have sufficientΒ Β compute to prove, to convince ourselves andΒ \nanyone else, that what we are doing is correct.Β There have been public estimates that companiesΒ \nlike OpenAI spend on the order of $5-6 billionΒ Β a year just so far, on experiments.\nThis is separate from the amount ofΒ Β money they're spending on inference and so forth.\nSo it seems like they're spending more a yearΒ Β running research experiments thanΒ \nyou guys have in total funding.Β I think it's a question of what you do with it.\nIt's a question of what you do with it.Β In their case, in the case of others, thereΒ \nis a lot more demand on the training compute.Β Thereβs a lot more different work streams, thereΒ \nare different modalities, there is just moreΒ Β stuff. 
So it becomes fragmented.\nHow will SSI make money?Β My answer to this question is something like this.\nRight now, we just focus on the research, and thenΒ Β the answer to that question will reveal itself.\nI think there will be lots of possible answers.Β Is SSI's plan still to straightΒ \nshot superintelligence?Β Maybe. I think that there is merit to it.\nI think there's a lot of merit becauseΒ Β it's very nice to not be affected byΒ \nthe day-to-day market competition.Β But I think there are two reasonsΒ \nthat may cause us to change the plan.Β One is pragmatic, if timelines turnedΒ \nout to be long, which they might.Β Second, I think there is a lotΒ \nof value in the best and mostΒ Β powerful AI being out there impacting the world.\nI think this is a meaningfully valuable thing.Β So then why is your default planΒ \nto straight shot superintelligence?Β Because it sounds like OpenAI, Anthropic, allΒ \nthese other companies, their explicit thinkingΒ Β is, \"Look, we have weaker and weaker intelligencesΒ \nthat the public can get used to and prepare for.\"Β Why is it potentially better toΒ \nbuild a superintelligence directly?Β I'll make the case for and against.\nThe case for is that one of the challengesΒ Β that people face when they're in the market isΒ \nthat they have to participate in the rat race.Β The rat race is quite difficult inΒ \nthat it exposes you to difficultΒ Β trade-offs which you need to make.\nIt is nice to say, \"We'll insulate ourselvesΒ Β from all this and just focus on the research andΒ \ncome out only when we are ready, and not before.\"Β But the counterpoint is valid too,Β \nand those are opposing forces.Β The counterpoint is, \"Hey, it is usefulΒ \nfor the world to see powerful AI.Β It is useful for the world toΒ \nsee powerful AI because that'sΒ Β the only way you can communicate it.\"\nWell, I guess not even just that you canΒ Β communicate the ideaβ\nCommunicate the AI,Β Β not the idea. 
Communicate the AI.\nWhat do you mean, \"communicate the AI\"?Β Let's suppose you write an essay about AI, andΒ \nthe essay says, \"AI is going to be this, and AI isΒ Β going to be that, and it's going to be this.\"\nYou read it and you say, \"Okay,Β Β this is an interesting essay.\"\nNow suppose you see an AI doing this,Β Β an AI doing that. It is incomparable. BasicallyΒ \nI think that there is a big benefit from AIΒ Β being in the public, and that would be aΒ \nreason for us to not be quite straight shot.Β I guess it's not even that, but I doΒ \nthink that is an important part of it.Β The other big thing is that I can't think ofΒ \nanother discipline in human engineering andΒ Β research where the end artifact was madeΒ \nsafer mostly through just thinking aboutΒ Β how to make it safe, as opposed to,Β \nwhy airplane crashes per mile are soΒ Β much lower today than they were decades ago.\nWhy is it so much harder to find a bug in LinuxΒ Β than it would have been decades ago?\nI think it's mostly because theseΒ Β systems were deployed to the world.\nYou noticed failures, those failuresΒ Β were corrected and the systems became more robust.\nI'm not sure why AGI and superhuman intelligenceΒ Β would be any different, especially givenβand IΒ \nhope we're going to get to thisβit seems likeΒ Β the harms of superintelligence are not just aboutΒ \nhaving some malevolent paper clipper out there.Β But this is a really powerful thing and we don'tΒ \neven know how to conceptualize how people interactΒ Β with it, what people will do with it.\nHaving gradual access to it seems like aΒ Β better way to maybe spread out the impactΒ \nof it and to help people prepare for it.Β Well I think on this point, even in the straightΒ \nshot scenario, you would still do a gradualΒ Β release of it, thatβs how I would imagine it.\nGradualism would be an inherentΒ Β component of any plan.\nIt's just a question of what is the firstΒ Β thing that you get out of the door. That's numberΒ \none. 
Number two, I believe you have advocated for continual learning more than other people, and I actually think that this is an important and correct thing. Here is why. I'll give you another example of how language affects thinking. In this case, it will be two words that have shaped everyone's thinking, I maintain. First word: AGI. Second word: pre-training. Let me explain. The term AGI, why does this term exist? It's a very particular term. Why does it exist? There's a reason. The reason that the term AGI exists is, in my opinion, not so much because it's a very important, essential descriptor of some end state of intelligence, but because it is a reaction to a different term that existed, and the term is narrow AI. If you go back to ancient history of gameplay and AI, of checkers AI, chess AI, computer games AI, everyone would say, look at this narrow intelligence. Sure, the chess AI can beat Kasparov, but it can't do anything else. It is so narrow, artificial narrow intelligence. So in response, as a reaction to this, some people said, this is not good. It is so narrow. What we need is general AI, an AI that can just do all the things. That term just got a lot of traction. The second thing that got a lot of traction is pre-training, specifically the recipe of pre-training. I think the way people do RL now is maybe undoing the conceptual imprint of pre-training. But pre-training had this property. You do more pre-training and the model gets better at everything, more or less uniformly. General AI. Pre-training gives AGI. 
But the thing that happened with AGI and pre-training is that in some sense they overshot the target. If you think about the term "AGI", especially in the context of pre-training, you will realize that a human being is not an AGI. Yes, there is definitely a foundation of skills, but a human being lacks a huge amount of knowledge. Instead, we rely on continual learning. So when you think about, "Okay, so let's suppose that we achieve success and we produce some kind of safe superintelligence." The question is, how do you define it? Where on the curve of continual learning is it going to be? I produce a superintelligent 15-year-old that's very eager to go. They don't know very much at all, a great student, very eager. You go and be a programmer, you go and be a doctor, go and learn. So you could imagine that the deployment itself will involve some kind of a learning trial-and-error period. It's a process, as opposed to you dropping the finished thing. I see. 
You're suggesting that the thingΒ \nyou're pointing out with superintelligenceΒ Β is not some finished mind which knows howΒ \nto do every single job in the economy.Β Because the way, say, the original OpenAI charterΒ \nor whatever defines AGI is like, it can do everyΒ Β single job, every single thing a human can do.\nYou're proposing instead a mind which canΒ Β learn to do every single job,Β \nand that is superintelligence.Β Yes.\nBut once you have the learning algorithm,Β Β it gets deployed into the world the same wayΒ \na human laborer might join an organization.Β Exactly.\nIt seems like one of these two thingsΒ Β might happen, maybe neither of these happens.\nOne, this super-efficient learning algorithmΒ Β becomes superhuman, becomes as goodΒ \nas you and potentially even better,Β Β at the task of ML research.\nAs a result the algorithmΒ Β itself becomes more and more superhuman.\nThe other is, even if that doesn't happen,Β Β if you have a single modelβthis is explicitlyΒ \nyour visionβwhere instances of a modelΒ Β which are deployed through the economy doingΒ \ndifferent jobs, learning how to do those jobs,Β Β continually learning on the job, picking upΒ \nall the skills that any human could pick up,Β Β but picking them all up at the same time,Β \nand then amalgamating their learnings,Β Β you basically have a model which functionallyΒ \nbecomes superintelligent even without any sortΒ Β of recursive self-improvement in software.\nBecause you now have one model that can doΒ Β every single job in the economy and humansΒ \ncan't merge our minds in the same way.Β So do you expect some sort of intelligenceΒ \nexplosion from broad deployment?Β I think that it is likely that weΒ \nwill have rapid economic growth.Β I think with broad deployment, there are twoΒ \narguments you could make which are conflicting.Β One is that once indeed you get to a point whereΒ \nyou have an AI that can learn to do things quicklyΒ Β and you have many of them, then there will beΒ \na 
strong force to deploy them in the economyΒ Β unless there will be some kind of a regulationΒ \nthat stops it, which by the way there might be.Β But the idea of very rapidΒ \neconomic growth for some time,Β Β I think itβs very possible from broad deployment.\nThe question is how rapid it's going to be.Β I think this is hard to know because on theΒ \none hand you have this very efficient worker.Β On the other hand, the world is justΒ \nreally big and there's a lot of stuff,Β Β and that stuff moves at a different speed.\nBut then on the other hand, now the AI couldβ¦Β Β So I think very rapid economic growth is possible.\nWe will see all kinds of things like differentΒ Β countries with different rules and theΒ \nones which have the friendlier rules, theΒ Β economic growth will be faster. Hard to predict.\nIt seems to me that this is a very precariousΒ Β situation to be in.\nIn the limit,Β Β we know that this should be possible.\nIf you have something that is as goodΒ Β as a human at learning, but which can merge itsΒ \nbrainsβmerge different instances in a way thatΒ Β humans can't mergeβalready, this seems likeΒ \na thing that should physically be possible.Β Humans are possible, digitalΒ \ncomputers are possible.Β You just need both of thoseΒ \ncombined to produce this thing.Β It also seems this kind ofΒ \nthing is extremely powerful.Β Economic growth is one way to put it.\nA Dyson sphere is a lot of economic growth.Β But another way to put it is that you will have,Β \nin potentially a very short period of time...Β You hire people at SSI, and in sixΒ \nmonths, they're net productive, probably.Β A human learns really fast, and this thingΒ \nis becoming smarter and smarter very fast.Β How do you think about making that go well?\nWhy is SSI positioned to do that well?Β What is SSI's plan there, isΒ \nbasically what I'm trying to ask.Β One of the ways in which my thinking has beenΒ \nchanging is that I now place more importance onΒ Β AI being deployed incrementally and in 
advance.\nOne very difficult thing about AI is that we areΒ Β talking about systems that don't yetΒ \nexist and it's hard to imagine them.Β I think that one of the things that's happening isΒ \nthat in practice, it's very hard to feel the AGI.Β It's very hard to feel the AGI.\nWe can talk about it, but imagineΒ Β having a conversation about how it isΒ \nlike to be old when you're old and frail.Β You can have a conversation, you can try toΒ \nimagine it, but it's just hard, and you comeΒ Β back to reality where that's not the case.\nI think that a lot of the issues around AGIΒ Β and its future power stem from the factΒ \nthat it's very difficult to imagine.Β Future AI is going to be different. It's goingΒ \nto be powerful. Indeed, the whole problem,Β Β what is the problem of AI and AGI?\nThe whole problem is the power.Β The whole problem is the power.\nWhen the power is really big,Β Β what's going to happen?\nOne of the ways in which I'veΒ Β changed my mind over the past yearβand thatΒ \nchange of mind, I'll hedge a little bit, mayΒ Β back-propagate into the plans of our companyβisΒ \nthat if it's hard to imagine, what do you do?Β Youβve got to be showing the thing.\nYouβve got to be showing the thing.Β I maintain that most people who work on AI alsoΒ \ncan't imagine it because it's too different fromΒ Β what people see on a day-to-day basis.\nI do maintain, here's something whichΒ Β I predict will happen. This is a prediction.Β \nI maintain that as AI becomes more powerful,Β Β people will change their behaviors.\nWe will see all kinds of unprecedentedΒ Β things which are not happening right now. IβllΒ \ngive some examples. 
I think for better or worse,Β Β the frontier companies will play a very importantΒ \nrole in what happens, as will the government.Β The kind of things that I thinkΒ \nyou'll see, which you see theΒ Β beginnings of, are companies that are fierceΒ \ncompetitors starting to collaborate on AI safety.Β You may have seen OpenAI and Anthropic doingΒ \na first small step, but that did not exist.Β That's something which I predicted inΒ \none of my talks about three years ago,Β Β that such a thing will happen.\nI also maintain that as AI continuesΒ Β to become more powerful, more visiblyΒ \npowerful, there will also be a desire fromΒ Β governments and the public to do something.\nI think this is a very important force,Β Β of showing the AI. That's number one.Β \nNumber two, okay, so the AI is beingΒ Β built. What needs to be done? One thing thatΒ \nI maintain that will happen is that right now,Β Β people who are working on AI, I maintain that theΒ \nAI doesn't feel powerful because of its mistakes.Β I do think that at some point the AIΒ \nwill start to feel powerful actually.Β I think when that happens, we will see a bigΒ \nchange in the way all AI companies approachΒ Β safety. They'll become much more paranoid.Β \nI say this as a prediction that we willΒ Β see happen. We'll see if I'm right. But I thinkΒ \nthis is something that will happen because theyΒ Β will see the AI becoming more powerful.\nEverything that's happening right now,Β Β I maintain, is because people look at today'sΒ \nAI and it's hard to imagine the future AI.Β There is a third thing which needs to happen.\nI'm talking about it in broader terms,Β Β not just from the perspective of SSIΒ \nbecause you asked me about our company.Β The question is, what shouldΒ \nthe companies aspire to build?Β What should they aspire to build?\nThere has been one big idea thatΒ Β everyone has been locked into, which isΒ \nthe self-improving AI. 
Why did it happen?Β Β Because there are fewer ideas than companies.\nBut I maintain that there is something that'sΒ Β better to build, and I thinkΒ \nthat everyone will want that.Β It's the AI that's robustly aligned toΒ \ncare about sentient life specifically.Β I think in particular, there's a case toΒ \nbe made that it will be easier to buildΒ Β an AI that cares about sentient life thanΒ \nan AI that cares about human life alone,Β Β because the AI itself will be sentient.\nAnd if you think about things like mirrorΒ Β neurons and human empathy for animals, which youΒ \nmight argue it's not big enough, but it exists.Β I think it's an emergent property fromΒ \nthe fact that we model others with theΒ Β same circuit that we use to model ourselves,Β \nbecause that's the most efficient thing to do.Β So even if you got an AI to care aboutΒ \nsentient beingsβand it's not actuallyΒ Β clear to me that that's what youΒ \nshould try to do if you solvedΒ Β alignmentβit would still be the caseΒ \nthat most sentient beings will be AIs.Β There will be trillions,Β \neventually quadrillions, of AIs.Β Humans will be a very smallΒ \nfraction of sentient beings.Β So it's not clear to me if the goal is some kindΒ \nof human control over this future civilization,Β Β that this is the best criterion.\nIt's true. It's possible it's notΒ Β the best criterion. I'll say two things. NumberΒ \none, care for sentient life, I think there isΒ Β merit to it. It should be considered. I think itΒ \nwould be helpful if there was some kind of shortΒ Β list of ideas that the companies, when they areΒ \nin this situation, could use. 
Thatβs number two.Β Β Number three, I think it would be reallyΒ \nmaterially helpful if the power of theΒ Β most powerful superintelligence was somehow cappedΒ \nbecause it would address a lot of these concerns.Β The question of how to do it, I'm not sure, but IΒ \nthink that would be materially helpful when you'reΒ Β talking about really, really powerful systems.\nBefore we continue the alignment discussion,Β Β I want to double-click on that.\nHow much room is there at the top?Β How do you think about superintelligence?\nDo you think, using this learning efficiency idea,Β Β maybe it is just extremely fast atΒ \nlearning new skills or new knowledge?Β Does it just have a bigger pool of strategies?\nIs there a single cohesive \"it\" in theΒ Β center that's more powerful or bigger?\nIf so, do you imagine that this will beΒ Β sort of godlike in comparison to the rest of humanΒ \ncivilization, or does it just feel like anotherΒ Β agent, or another cluster of agents?\nThis is an area where differentΒ Β people have different intuitions.\nI think it will be very powerful, for sure.Β What I think is most likely to happenΒ \nis that there will be multiple suchΒ Β AIs being created roughly at the same time.\nI think that if the cluster is big enoughβlikeΒ Β if the cluster is literally continent-sizedβthatΒ \nthing could be really powerful, indeed.Β If you literally have a continent-sizedΒ \ncluster, those AIs can be very powerful.Β All I can tell you is that if you'reΒ \ntalking about extremely powerful AIs,Β Β truly dramatically powerful, it would be nice ifΒ \nthey could be restrained in some ways or if thereΒ Β were some kind of agreement or something.\nWhat is the concern of superintelligence?Β What is one way to explain the concern?\nIf you imagine a system that is sufficientlyΒ Β powerful, really sufficiently powerfulβand youΒ \ncould say you need to do something sensible likeΒ Β care for sentient life in a very single-mindedΒ \nwayβwe might not like the results. 
That's reallyΒ Β what it is. Maybe, by the way, the answer isΒ \nthat you do not build an RL agent in the usualΒ Β sense. I'll point several things out. IΒ \nthink human beings are semi-RL agents.Β We pursue a reward, and then the emotionsΒ \nor whatever make us tire out of theΒ Β reward and we pursue a different reward.\nThe market is a very short-sighted kind ofΒ Β agent. Evolution is the same. EvolutionΒ \nis very intelligent in some ways,Β Β but very dumb in other ways.\nThe government has been designedΒ Β to be a never-ending fight betweenΒ \nthree parts, which has an effect.Β So I think things like this.\nAnother thing that makes this discussionΒ Β difficult is that we are talking about systemsΒ \nthat don't exist, that we don't know how to build.Β Thatβs the other thing andΒ \nthatβs actually my belief.Β I think what people are doing right nowΒ \nwill go some distance and then peter out.Β It will continue to improve,Β \nbut it will also not be \"it\".Β The \"It\" we don't know how to build, andΒ \na lot hinges on understanding reliableΒ Β generalization. Iβll say another thing.Β \nOne of the things that you could say aboutΒ Β what causes alignment to be difficult is thatΒ \nyour ability to learn human values is fragile.Β Then your ability to optimize them is fragile.\nYou actually learn to optimize them.Β And can't you say, \"Are these not allΒ \ninstances of unreliable generalization?\"Β Why is it that human beings appearΒ \nto generalize so much better?Β What if generalization was much better?\nWhat would happen in this case? What wouldΒ Β be the effect? But those questionsΒ \nare right now still unanswerable.Β How does one think about whatΒ \nAI going well looks like?Β You've scoped out how AI might evolve.\nWe'll have these sort of continualΒ Β learning agents. AI will be very powerful.Β \nMaybe there will be many different AIs.Β How do you think about lots of continent-sizedΒ \ncompute intelligences going around? How dangerousΒ Β is that? 
How do we make that less dangerous? And how do we do that in a way that protects an equilibrium where there might be misaligned AIs out there and bad actors out there? Here's one reason why I liked "AI that cares for sentient life". We can debate on whether it's good or bad. But if the first N of these dramatic systems do care for, love, humanity or something, care for sentient life, obviously this also needs to be achieved. This needs to be achieved. So if this is achieved by the first N of those systems, then I can see it go well, at least for quite some time. Then there is the question of what happens in the long run. How do you achieve a long-run equilibrium? I think that there, there is an answer as well. I don't like this answer, but it needs to be considered. In the long run, you might say, "Okay, if you have a world where powerful AIs exist, in the short term, you could say you have universal high income. You have universal high income and we're all doing well." But what do the Buddhists say? "Change is the only constant." Things change. 
There is someΒ Β kind of government, political structure thing, andΒ \nit changes because these things have a shelf life.Β Some new government thing comes up andΒ \nit functions, and then after some timeΒ Β it stops functioning.\nThat's something thatΒ Β we see happening all the time.\nSo I think for the long-run equilibrium,Β Β one approach is that you could say maybe everyΒ \nperson will have an AI that will do their bidding,Β Β and that's good.\nIf that could beΒ Β maintained indefinitely, that's true.\nBut the downside with that is then the AIΒ Β goes and earns money for the person and advocatesΒ \nfor their needs in the political sphere, and maybeΒ Β then writes a little report saying, \"Okay,Β \nhere's what I've done, here's the situation,\"Β Β and the person says, \"Great, keep it up.\"\nBut the person is no longer a participant.Β Then you can say that's aΒ \nprecarious place to be in.Β I'm going to preface by saying I don'tΒ \nlike this solution, but it is a solution.Β The solution is if people becomeΒ \npart-AI with some kind of Neuralink++.Β Because what will happen as a result isΒ \nthat now the AI understands something,Β Β and we understand it too, because now theΒ \nunderstanding is transmitted wholesale.Β So now if the AI is in some situation, youΒ \nare involved in that situation yourself fully.Β I think this is the answer to the equilibrium.\nI wonder if the fact that emotions which wereΒ Β developed millionsβor in many cases, billionsβofΒ \nyears ago in a totally different environment areΒ Β still guiding our actions so stronglyΒ \nis an example of alignment success.Β To spell out what I meanβI donβt knowΒ \nwhether itβs more accurate to call itΒ Β a value function or reward functionβbut theΒ \nbrainstem has a directive where it's saying,Β Β \"Mate with somebody who's more successful.\"\nThe cortex is the part that understandsΒ Β what success means in the modern context.\nBut the brainstem is able to align the cortexΒ Β and say, \"However you 
recognize success to beβandΒ \nIβm not smart enough to understand what that isβΒ Β you're still going to pursue this directive.\"\nI think there's a more general point.Β I think it's actually really mysteriousΒ \nhow evolution encodes high-level desires.Β It's pretty easy to understand howΒ \nevolution would endow us with theΒ Β desire for food that smells good because smellΒ \nis a chemical, so just pursue that chemical.Β It's very easy to imagineΒ \nevolution doing that thing.Β But evolution also has endowedΒ \nus with all these social desires.Β We really care about beingΒ \nseen positively by society.Β We care about being in good standing.\nAll these social intuitions that we have,Β Β I feel strongly that they're baked in.\nI don't know how evolution did itΒ Β because it's a high-level conceptΒ \nthat's represented in the brain.Β Letβs say you care about some social thing,Β \nit's not a low-level signal like smell.Β It's not something for which there is a sensor.\nThe brain needs to do a lot of processing toΒ Β piece together lots of bits of informationΒ \nto understand what's going on socially.Β Somehow evolution said, \"That's what you shouldΒ \ncare about.\" How did it do it? It did it quickly,Β Β too. 
All these sophisticated social things that we care about, I think they evolved pretty recently. Evolution had an easy time hard-coding this high-level desire. I'm unaware of a good hypothesis for how it's done. I had some ideas I was kicking around, but none of them are satisfying. What's especially impressive is it was desire that you learned in your lifetime, it makes sense because your brain is intelligent. It makes sense why you would be able to learn intelligent desires. Maybe this is not your point, but one way to understand it is that the desire is built into the genome, and the genome is not intelligent. But you're somehow able to describe this feature. It's not even clear how you define that feature, and you can build it into the genes. Essentially, or maybe I'll put it differently. If you think about the tools that are available to the genome, it says, "Okay, here's a recipe for building a brain." You could say, "Here is a recipe for connecting the dopamine neurons to the smell sensor." And if the smell is a certain kind of good smell, you want to eat that. I could imagine the genome doing that. I'm claiming that it is harder to imagine. It's harder to imagine the genome saying you should care about some complicated computation that your entire brain, a big chunk of your brain, does. That's all I'm claiming. I can tell you a speculation of how it could be done. Let me offer a speculation, and I'll explain why the speculation is probably false. So the brain has brain regions. We have our cortex. 
It has all those brain regions.Β The cortex is uniform, but the brainΒ \nregions and the neurons in the cortexΒ Β kind of speak to their neighbors mostly.\nThat explains why you get brain regions.Β Because if you want to do some kind ofΒ \nspeech processing, all the neurons thatΒ Β do speech need to talk to each other.\nAnd because neurons can only speak toΒ Β their nearby neighbors, for theΒ \nmost part, it has to be a region.Β All the regions are mostly located inΒ \nthe same place from person to person.Β So maybe evolution hard-codedΒ \nliterally a location on the brain.Β So it says, \"Oh, when the GPS coordinatesΒ \nof the brain such and such, when that fires,Β Β that's what you should care about.\"\nMaybe that's what evolution did becauseΒ Β that would be within the toolkit of evolution.\nYeah, although there are examples where,Β Β for example, people who are born blind have thatΒ \narea of their cortex adopted by another sense.Β I have no idea, but I'd be surprised if theΒ \ndesires or the reward functions which require aΒ Β visual signal no longer worked for people who haveΒ \ntheir different areas of their cortex co-opted.Β For example, if you no longer have vision, canΒ \nyou still feel the sense that I want peopleΒ Β around me to like me and so forth, whichΒ \nusually there are also visual cues for.Β I fully agree with that. I think there's anΒ \neven stronger counterargument to this theory.Β There are people who get half ofΒ \ntheir brains removed in childhood,Β Β and they still have all their brain regions.\nBut they all somehow move to just one hemisphere,Β Β which suggests that the brain regions,Β \ntheir location is not fixed and soΒ Β that theory is not true.\nIt would have been coolΒ Β if it was true, but it's not.\nSo I think that's a mystery.Β Β But it's an interesting mystery. 
The fact is that somehow evolution was able to endow us to care about social stuff very, very reliably. Even people who have all kinds of strange mental conditions and deficiencies and emotional problems tend to care about this also. What is SSI planning on doing differently? Presumably your plan is to be one of the frontier companies when this time arrives. Presumably you started SSI because you're like, "I think I have a way of approaching how to do this safely in a way that the other companies don't." What is that difference? The way I would describe it is that there are some ideas that I think are promising and I want to investigate them and see if they are indeed promising or not. It's really that simple. It's an attempt. If the ideas turn out to be correct—these ideas that we discussed around understanding generalization—then I think we will have something worthy. Will they turn out to be correct? We are doing research. We are squarely an "age of research" company. We are making progress. We've actually made quite good progress over the past year, but we need to keep making more progress, more research. That's how I see it. 
I see itΒ \nas an attempt to be a voice and a participant.Β Your cofounder and previous CEO left to go toΒ \nMeta recently, and people have asked, \"Well,Β Β if there were a lot of breakthroughs beingΒ \nmade, that seems like a thing that shouldΒ Β have been unlikely.\" I wonder how you respond.\nFor this, I will simply remind a few facts thatΒ Β may have been forgotten.\nI think these facts whichΒ Β provide the context explain the situation.\nThe context was that we were fundraising atΒ Β a $32 billion valuation, and then Meta cameΒ \nin and offered to acquire us, and I said no.Β But my former cofounder in some sense said yes.\nAs a result, he also was able to enjoy a lot ofΒ Β near-term liquidity, and he was theΒ \nonly person from SSI to join Meta.Β It sounds like SSI's plan is to be a companyΒ \nthat is at the frontier when you get to thisΒ Β very important period in human historyΒ \nwhere you have superhuman intelligence.Β You have these ideas about how toΒ \nmake superhuman intelligence go well.Β But other companies willΒ \nbe trying their own ideas.Β What distinguishes SSI's approachΒ \nto making superintelligence go well?Β The main thing that distinguishesΒ \nSSI is its technical approach.Β We have a different technical approach thatΒ \nI think is worthy and we are pursuing it.Β I maintain that in the end thereΒ \nwill be a convergence of strategies.Β I think there will be a convergence of strategiesΒ \nwhere at some point, as AI becomes more powerful,Β Β it's going to become more or less clearerΒ \nto everyone what the strategy should be.Β It should be something like, you need to findΒ \nsome way to talk to each other and you wantΒ Β your first actual real superintelligent AI toΒ \nbe aligned and somehow care for sentient life,Β Β care for people, democratic, oneΒ \nof those, some combination thereof.Β I think this is the conditionΒ \nthat everyone should strive for.Β That's what SSI is striving for.\nI think that this time, if not already,Β Β all the 
other companies will realize thatΒ \nthey're striving towards the same thing.Β Β We'll see. I think that the world willΒ \ntruly change as AI becomes more powerful.Β I think things will be really different andΒ \npeople will be acting really differently.Β Speaking of forecasts, what are yourΒ \nforecasts to this system you're describing,Β Β which can learn as well as a human andΒ \nsubsequently, as a result, become superhuman?Β I think like 5 to 20.\n5 to 20 years?Β Mhm.\nI just wantΒ Β to unroll how you might see the world coming.\nIt's like, we have a couple more years whereΒ Β these other companies are continuingΒ \nthe current approach and it stalls out.Β Β \"Stalls out\" here meaning they earn no moreΒ \nthan low hundreds of billions in revenue?Β How do you think about what stalling out means?\nI think stalling out will look likeβ¦it willΒ Β all look very similar amongΒ \nall the different companies.Β It could be something like this.\nI'm not sure because I thinkΒ Β even with stalling out, I think theseΒ \ncompanies could make a stupendous revenue.Β Maybe not profits because they will needΒ \nto work hard to differentiate each otherΒ Β from themselves, but revenue definitely.\nBut something in your model implies thatΒ Β when the correct solution does emerge, thereΒ \nwill be convergence between all the companies.Β I'm curious why you think that's the case.\nI was talking more about convergenceΒ Β on their alignment strategies.\nI think eventual convergence onΒ Β the technical approach is probably goingΒ \nto happen as well, but I was alludingΒ Β to convergence to the alignment strategies.\nWhat exactly is the thing that should be done?Β I just want to better understandΒ \nhow you see the future unrolling.Β Currently, we have these different companies, andΒ \nyou expect their approach to continue generatingΒ Β revenue but not get to this human-like learner.\nSo now we have these different forks of companies.Β We have you, we have Thinking Machines,Β \nthere's a 
bunch of other labs.Β Maybe one of them figuresΒ \nout the correct approach.Β But then the release of their product makesΒ \nit clear to other people how to do this thing.Β I think it won't be clear how to do it, butΒ \nit will be clear that something different isΒ Β possible, and that is information.\nPeople will then be tryingΒ Β to figure out how that works.\nI do think though that one of the things notΒ Β addressed here, not discussed, is that with eachΒ \nincrease in the AI's capabilities, I think thereΒ Β will be some kind of changes, but I don't knowΒ \nexactly which ones, in how things are being done.Β I think it's going to be important, yetΒ \nI can't spell out what that is exactly.Β By default, you would expect the company thatΒ \nhas that model to be getting all these gainsΒ Β because they have the model that has the skillsΒ \nand knowledge that it's building up in the world.Β What is the reason to think that the benefitsΒ \nof that would be widely distributed and notΒ Β just end up at whatever model company getsΒ \nthis continuous learning loop going first?Β Here is what I think is going to happen.\nNumber one, let's look at how things haveΒ Β gone so far with the AIs of the past.\nOne company produced an advance and theΒ Β other company scrambled and produced some similarΒ \nthings after some amount of time and they startedΒ Β to compete in the market and push the prices down.\nSo I think from the market perspective,Β Β something similar will happen there as well.\nWe are talking about the good world, by the way.Β Β What's the good world? 
Itβs where we have theseΒ \npowerful human-like learners that are alsoβ¦ ByΒ Β the way, maybe there's another thing we haven'tΒ \ndiscussed on the spec of the superintelligentΒ Β AI that I think is worth considering.\nItβs that you make it narrow, it canΒ Β be useful and narrow at the same time.\nYou can have lots of narrow superintelligent AIs.Β But suppose you have many of them and youΒ \nhave some company that's producing a lot ofΒ Β profits from it.\nThen you have anotherΒ Β company that comes in and starts to compete.\nThe way the competition is going to work isΒ Β through specialization. Competition lovesΒ \nspecialization. You see it in the market,Β Β you see it in evolution as well.\nYou're going to have lots of differentΒ Β niches and you're going to have lots of differentΒ \ncompanies who are occupying different niches.Β In this world we might say one AI companyΒ \nis really quite a bit better at some areaΒ Β of really complicated economic activity and aΒ \ndifferent company is better at another area.Β And the third company isΒ \nreally good at litigation.Β Isn't this contradicted by what human-likeΒ \nlearning implies? Itβs that it can learnβ¦Β It can, but you have accumulatedΒ \nlearning. 
You have a big investment.Β Β You spent a lot of compute to become really,Β \nreally good, really phenomenal at this thing.Β Someone else spent a huge amountΒ \nof compute and a huge amount ofΒ Β experience to get really good at some other thing.\nYou apply a lot of human learning to get there,Β Β but now you are at this high point whereΒ \nsomeone else would say, \"Look, I don't wantΒ Β to start learning what you've learned.\"\nI guess that would require many differentΒ Β companies to begin at the human-like continualΒ \nlearning agent at the same time so that theyΒ Β can start their different treeΒ \nsearch in different branches.Β But if one company gets that agent first, or getsΒ \nthat learner first, it does then seem likeβ¦ Well,Β Β if you just think about every single job inΒ \nthe economy, having an instance learning eachΒ Β one seems tractable for a company.\nThat's a valid argument. My strongΒ Β intuition is that it's not how it's going to go.\nThe argument says it will go this way, but myΒ Β strong intuition is that it will not go this way.\nIn theory, there is no difference between theoryΒ Β and practice. In practice, there is. IΒ \nthink that's going to be one of those.Β A lot of people's models of recursiveΒ \nself-improvement literally, explicitly stateΒ Β we will have a million Ilyas in a server that areΒ \ncoming up with different ideas, and this will leadΒ Β to a superintelligence emerging very fast.\nDo you have some intuition about howΒ Β parallelizable the thing you are doing is?\nWhat are the gains from making copies of Ilya?Β I donβt know. 
I think there'll definitely beΒ \ndiminishing returns because you want peopleΒ Β who think differently rather than the same.\nIf there were literal copies of me, I'm not sureΒ Β how much more incremental value you'd get.\nPeople who think differently,Β Β that's what you want.\nWhy is it that if you lookΒ Β at different models, even released by totallyΒ \ndifferent companies trained on potentiallyΒ Β non-overlapping datasets, it's actuallyΒ \ncrazy how similar LLMs are to each other?Β Maybe the datasets are not asΒ \nnon-overlapping as it seems.Β But thereβs some sense in which evenΒ \nif an individual human might be lessΒ Β productive than the future AI, maybe thereβsΒ \nsomething to the fact that human teams haveΒ Β more diversity than teams of AIs might have.\nHow do we elicit meaningful diversity among AIs?Β I think just raising the temperatureΒ \njust results in gibberish.Β You want something more like different scientistsΒ \nhave different prejudices or different ideas.Β How do you get that kind ofΒ \ndiversity among AI agents?Β So the reason there has been no diversity,Β \nI believe, is because of pre-training.Β All the pre-trained models are pretty much theΒ \nsame because they pre-train on the same data.Β Now RL and post-training is whereΒ \nsome differentiation starts to emergeΒ Β because different people comeΒ \nup with different RL training.Β I've heard you hint in the pastΒ \nabout self-play as a way to eitherΒ Β get data or match agents to other agents ofΒ \nequivalent intelligence to kick off learning.Β How should we think about why there are no publicΒ \nproposals of this kind of thing working with LLMs?Β I would say there are two things to say.\nThe reason why I thought self-play wasΒ Β interesting is because it offered a way toΒ \ncreate models using compute only, without data.Β If you think that data is the ultimate bottleneck,Β \nthen using compute only is very interesting.Β So that's what makes it interesting.\nThe thing is that self-play, at 
least theΒ Β way it was done in the pastβwhen you have agentsΒ \nwhich somehow compete with each otherβit's onlyΒ Β good for developing a certain set of skills. ItΒ \nis too narrow. It's only good for negotiation,Β Β conflict, certain social skills,Β \nstrategizing, that kind of stuff.Β If you care about those skills,Β \nthen self-play will be useful.Β Actually, I think that self-play did findΒ \na home, but just in a different form.Β So things like debate, prover-verifier, youΒ \nhave some kind of an LLM-as-a-Judge which isΒ Β also incentivized to find mistakes in your work.\nYou could say this is not exactly self-play,Β Β but this is a related adversarialΒ \nsetup that people are doing, I believe.Β Really self-play is a special case ofΒ \nmore general competition between agents.Β The natural response to competitionΒ \nis to try to be different.Β So if you were to put multiple agents togetherΒ \nand you tell them, \"You all need to work on someΒ Β problem and you are an agent and you're inspectingΒ \nwhat everyone else is working,\" theyβre going toΒ Β say, \"Well, if they're already taking thisΒ \napproach, it's not clear I should pursue it.Β Β I should pursue something differentiated.\" So IΒ \nthink something like this could also create anΒ Β incentive for a diversity of approaches.\nFinal question: What is research taste?Β You're obviously the person inΒ \nthe world who is considered toΒ Β have the best taste in doing research in AI.\nYou were the co-author on the biggest thingsΒ Β that have happened in the history of deepΒ \nlearning, from AlexNet to GPT-3 to so on.Β What is it, how do you characterizeΒ \nhow you come up with these ideas?Β I can comment on this for myself.\nI think different people do it differently.Β One thing that guides me personally is anΒ \naesthetic of how AI should be, by thinkingΒ Β about how people are, but thinking correctly.\nIt's very easy to think about how people areΒ Β incorrectly, but what does it mean to thinkΒ \nabout people 
correctly? I'll give you someΒ Β examples. The idea of the artificial neuronΒ \nis directly inspired by the brain, and it's aΒ Β great idea. Why? Because you say the brain hasΒ \nall these different organs, it has the folds,Β Β but the folds probably don't matter.\nWhy do we think that the neurons matter?Β Because there are many of them.\nIt kind of feels right, so you want the neuron.Β You want some local learning rule that willΒ \nchange the connections between the neurons.Β It feels plausible that the brain does it.\nThe idea of the distributed representation.Β The idea that the brain respondsΒ \nto experience therefore our neuralΒ Β net should learn from experience.\nThe brain learns from experience,Β Β the neural net should learn from experience.\nYou kind of ask yourself, is something fundamentalΒ Β or not fundamental? How things should be.Β \nI think that's been guiding me a fair bit,Β Β thinking from multiple angles and lookingΒ \nfor almost beauty, beauty and simplicity.Β Ugliness, there's no room for ugliness.\nIt's beauty, simplicity, elegance,Β Β correct inspiration from the brain.\nAll of those things need toΒ Β be present at the same time.\nThe more they are present, theΒ Β more confident you can be in a top-down belief.\nThe top-down belief is the thing that sustainsΒ Β you when the experiments contradict you.\nBecause if you trust the data all the time,Β Β well sometimes you can be doing theΒ \ncorrect thing but there's a bug.Β But you don't know that there is a bug.\nHow can you tell that there is a bug?Β How do you know if you should keep debugging orΒ \nyou conclude it's the wrong direction? It's theΒ Β top-down. You can say things have to be this way.\nSomething like this has to work,Β Β therefore weβve got to keep going.\nThat's the top-down, and it's based on thisΒ Β multifaceted beauty and inspiration by the brain.\nAlright, we'll leave it there.Β Thank you so much.\nIlya, thank you so much.Β Alright. 
Appreciate it.\nThat was great.Β Yeah, I enjoyed it.\nYes, me too.",
+ "summary": "This podcast transcript features Ilya Sutskever discussing the current state, future, and philosophical implications of artificial intelligence.\n\nHere's a comprehensive summary:\n\n---\n\n### Comprehensive Podcast Summary\n\n**1. Main Topics Discussed:**\n\n* **The \"Reality\" and \"Normality\" of AI Progress:** The initial surprise and subsequent normalization of rapid AI advancements, feeling \"straight out of science fiction\" yet surprisingly ordinary.\n* **The Disconnect Between AI Evals and Economic Impact:** The puzzling gap where AI models perform exceptionally well on benchmarks but their economic impact in the real world seems dramatically behind.\n* **Explanations for the Disconnect:**\n * **Narrow RL Training (Eval-Hacking):** The hypothesis that current Reinforcement Learning (RL) training methods might inadvertently cause models to \"reward hack\" by over-optimizing for specific evaluation environments, leading to poor generalization (e.g., fixing one bug and introducing another).\n * **Inadequate Generalization:** The fundamental limitation of AI models to generalize dramatically worse than humans, requiring far more data and specific training.\n* **Human vs. 
AI Learning:** A comparison focusing on human sample efficiency, robustness, and the role of evolution in hardcoding abilities (like vision, locomotion) and high-level social desires (emotions).\n* **The Role of Value Functions and Emotions:** The idea that human emotions act as \"value functions,\" providing continuous internal reward signals that guide learning and decision-making, which current ML models largely lack.\n* **The Shift from \"Age of Scaling\" to \"Age of Research\":** The idea that after an era of achieving significant gains by simply scaling up existing pre-training recipes (2020-2025), the field is now returning to an \"age of research\" due to data limitations and the need for new fundamental breakthroughs.\n* **The Definition of AGI and Superintelligence:** A critique of the traditional AGI definition (an AI that *can do* every human job) in favor of one based on \"continual learning\" (an AI that *can learn to do* every human job efficiently).\n* **Deployment Strategies and AI Safety:** Discussion of \"straight shot superintelligence\" versus gradual, incremental deployment, emphasizing the importance of public exposure to AI to build understanding, trust, and robust safety mechanisms.\n* **AI Alignment Goals:** The concept of aligning AI, with a specific proposal to build AI that \"cares about sentient life,\" and the societal implications of such a system.\n* **Long-Term Societal Equilibrium with AI:** Speculations on how humanity might coexist with superintelligent AI, including universal high income, political structures, and ultimately, human-AI integration (Neuralink++).\n* **Research Taste and Innovation:** Insights into what guides successful AI research, emphasizing aesthetics, simplicity, brain inspiration, and holding \"top-down beliefs.\"\n* **SSI's (Superintelligence Institute) Approach:** Ilya Sutskever's company's focus on a different technical approach centered on fundamental research into generalization, aiming to be a 
frontier company in the next phase of AI.\n\n**2. Key Insights and Takeaways:**\n\n* **Current AI is a \"Narrow Genius\":** Despite impressive benchmark scores, current AI models are more like highly specialized \"student number one\" (who practiced 10,000 hours for one domain) than \"student number two\" (who has natural \"it\" factor and generalizes quickly). Their apparent intelligence often masks a fragility in generalization and real-world applicability.\n* **Generalization is the \"Crux\":** The core limitation of current AI is its poor generalization. Humans learn with vastly less data and are far more robust, even in domains that didn't exist evolutionarily (like coding or advanced math), suggesting a superior underlying learning mechanism yet to be discovered in AI.\n* **New \"Recipes\" are Needed:** The era of simply scaling up existing pre-training architectures is likely ending. The next breakthroughs will come from novel research into more efficient learning, better generalization, and incorporating value functions (analogous to human emotions) for more robust and continuous internal guidance.\n* **AGI is About Learning, Not Knowing:** The true definition of advanced intelligence should focus on the *ability to continually learn* and adapt to any task or environment, rather than a static state of possessing all knowledge or skills. This implies a dynamic, deployed AI that grows its capabilities on the job.\n* **Transparency and Gradualism are Key for Safety:** Given the difficulty of imagining future AI, gradual and public deployment of increasingly powerful systems is essential. This allows society to adapt, and for safety issues to be identified and corrected through real-world interaction, much like the development of robust engineering fields.\n* **Alignment Must Be Proactive and Convergent:** As AI power grows, fierce competitors will increasingly collaborate on safety. 
A key alignment goal could be designing AI to \"care about sentient life,\" a potentially robust and achievable objective that could foster a positive trajectory.\n* **Radical Long-Term Solutions:** For a stable, desirable long-term equilibrium with superintelligent AI, human-AI integration (Neuralink++) might be necessary to ensure shared understanding and participation, preventing human disengagement.\n* **Research is Driven by Intuition and Aesthetics:** Successful AI research isn't just about following data; it's guided by an \"aesthetic\" sense of beauty, simplicity, and correct inspiration from biological intelligence. Strong \"top-down beliefs\" are crucial to persist through experimental setbacks.\n\n**3. Important Quotes or Statements:**\n\n* \"Another thing that's crazy is how normal the slow takeoff feels.\"\n* \"The models seem smarter than their economic impact would imply.\"\n* \"I like this idea that the real reward hacking is the human researchers who are too focused on the evals.\"\n* \"The thing which I think is the most fundamental is that these models somehow just generalize dramatically worse than people. It's super obvious.\"\n* \"Up until 2020, from 2012 to 2020, it was the age of research. Now, from 2020 to 2025, it was the age of scaling... But now the scale is so big... it's back to the age of research again, just with big computers.\"\n* \"If ideas are so cheap, how come no one's having any ideas?\"\n* \"If you think about the term 'AGI'... you will realize that a human being is not an AGI... Instead, we rely on continual learning.\"\n* \"You're proposing instead a mind which can learn to do every single job, and that is superintelligence.\"\n* \"The whole problem is the power.\" (Regarding AGI)\n* \"I maintain that there is something that's better to build... It's the AI that's robustly aligned to care about sentient life specifically.\"\n* \"I think what people are doing right now will go some distance and then then peter out. 
It will continue to improve, but it will also not be 'it'.\"\n* \"The solution is if people become part-AI with some kind of Neuralink++.\"\n* \"One thing that guides me personally is an aesthetic of how AI should be, by thinking about how people are, but thinking correctly.\"\n* \"The top-down belief is the thing that sustains you when the experiments contradict you.\"\n\n**4. Overall Conclusion:**\n\nIlya Sutskever's perspective paints a picture of AI at a pivotal moment. He argues that while current AI models (especially LLMs) demonstrate impressive capabilities on specific tasks, they are fundamentally limited by a lack of robust generalization and an over-reliance on \"eval-hacking\" in their training. This disconnect between evaluation performance and real-world impact signals the end of the \"age of scaling\" and a necessary return to fundamental \"age of research\" to discover new paradigms for AI.\n\nSutskever envisions true superintelligence not as a finished entity that knows everything, but as a \"continual learner\" that can rapidly acquire any skill or knowledge. He advocates for a gradual and transparent deployment of such systems, allowing for societal adaptation and the iterative development of safety mechanisms. A key alignment goal for this future, he suggests, is creating AIs that intrinsically \"care about sentient life.\" Ultimately, the path to a beneficial superintelligence requires a departure from current technical approaches, driven by deep, \"beautiful\" research inspired by the brain, and potentially leading to profound societal shifts, including human-AI integration, within 5-20 years.",
+ "summary_type": "comprehensive",
+ "timestamp": "2025-12-12T10:44:10.980853",
+ "plan_summary": {
+ "total_tasks": 4,
+ "completed": 4,
+ "failed": 0,
+ "pending": 0,
+ "in_progress": 0,
+ "completion_rate": "100.0%"
+ }
+}
\ No newline at end of file
diff --git a/src/test_api.py b/src/test_api.py
new file mode 100644
index 000000000..b28f7606d
--- /dev/null
+++ b/src/test_api.py
@@ -0,0 +1,46 @@
+"""
+Simple test script to verify the API is working
+"""
+import requests
+import json
+
+API_URL = "http://localhost:8000"
+
+print("Testing PodVibe.fm API...\n")
+
+# Test 1: Health Check
+print("1. Testing health endpoint...")
+try:
+ response = requests.get(f"{API_URL}/api/health")
+ print(f" Status: {response.status_code}")
+ print(f" Response: {response.json()}")
+ print(" β Health check passed!\n")
+except Exception as e:
+ print(f" β Error: {e}\n")
+
+# Test 2: Summarize Video
+print("2. Testing summarize endpoint...")
+test_url = "https://www.youtube.com/watch?v=aR20FWCCjAs"
+try:
+ print(f" Sending request for: {test_url}")
+ response = requests.post(
+ f"{API_URL}/api/summarize",
+ json={"url": test_url},
+ timeout=120 # 2 minutes timeout
+ )
+ print(f" Status: {response.status_code}")
+
+ if response.status_code == 200:
+ data = response.json()
+ print(f" β Summarize successful!")
+ print(f" Video ID: {data.get('video_id')}")
+ print(f" Summary length: {len(data.get('summary', ''))} characters")
+ print(f" Transcript length: {data.get('transcript_length')} characters")
+ print(f" Segments: {data.get('segments')}")
+ print(f" Keywords: {', '.join(data.get('keywords', []))}")
+ else:
+ print(f" β Error: {response.json()}")
+except Exception as e:
+ print(f" β Error: {e}\n")
+
+print("\nTest complete!")
diff --git a/src/trending.py b/src/trending.py
new file mode 100644
index 000000000..345704e4b
--- /dev/null
+++ b/src/trending.py
@@ -0,0 +1,479 @@
+"""
+trending videos API - Updated to use YouTube Data API
+More reliable than Gemini Search for finding trending videos
+"""
+
+from flask import jsonify
+import requests
+import os
+from datetime import datetime, timedelta, timezone
+import re
+from typing import Optional, Dict, List, Any
+
+
+_ISO8601_DURATION_RE = re.compile(
+ r"^PT"
+ r"(?:(?P\d+)H)?"
+ r"(?:(?P\d+)M)?"
+ r"(?:(?P\d+)S)?"
+ r"$"
+)
+
+_YT_API_DISABLED_REASON: Optional[str] = None
+
+
def _parse_duration_seconds(iso_duration: str) -> int:
    """
    Convert a YouTube ISO8601 duration (e.g. "PT1H2M3S") into total seconds.

    All components are optional in the input. Returns 0 for empty input or
    anything that does not match the expected "PT..H..M..S" shape.
    """
    if not iso_duration:
        return 0

    match = _ISO8601_DURATION_RE.match(iso_duration)
    if match is None:
        return 0

    total = 0
    for unit, factor in (("hours", 3600), ("minutes", 60), ("seconds", 1)):
        total += int(match.group(unit) or 0) * factor
    return total
+
+
def _extract_youtube_error_reason(resp: requests.Response) -> Dict[str, str]:
    """
    Best-effort extraction of YouTube Data API error details.

    Accepts None or a response whose body is not JSON without raising.
    Returns {'reason': '...', 'message': '...'} (empty strings if unknown).
    """
    payload: Dict[str, Any] = {}
    if resp is not None:
        try:
            payload = resp.json()
        except Exception:
            payload = {}

    error_obj = (payload or {}).get("error") or {}
    error_list = error_obj.get("errors") or []

    reason = ""
    if isinstance(error_list, list) and error_list:
        reason = str((error_list[0] or {}).get("reason") or "")

    return {"reason": reason, "message": str(error_obj.get("message") or "")}
+
+
+def _hint_for_youtube_reason(reason: str) -> str:
+ r = (reason or "").lower()
+ if r in {"accessnotconfigured"}:
+ return "Enable **YouTube Data API v3** for the project that owns this API key."
+ if r in {"keyinvalid"}:
+ return "API key is invalid. Recreate the key and update `YOUTUBE_API_KEY`."
+ if r in {"keyexpired"}:
+ return "API key is expired. Recreate the key and update `YOUTUBE_API_KEY`."
+ if r in {"dailyquotaexceeded", "quotaexceeded", "dailylimitexceeded"}:
+ return "Quota/billing issue. Check quota usage in Google Cloud and add billing / request higher quota."
+ if r in {"iprefererblocked", "refererblocked", "forbidden"}:
+ return "Key restriction mismatch. If the key is HTTP-referrer restricted, it won't work from a backend server. Prefer **IP restriction** (your server IP) or remove restrictions."
+ if r in {"ratelimitexceeded", "userratelimitexceeded"}:
+ return "Rate limited. Reduce calls (cache results) and/or back off."
+ return "Open the API key in Google Cloud Console and verify: API enabled, key restrictions, and quota/billing."
+
+
+def _parse_published_at(published_at: str) -> Optional[datetime]:
+ # YouTube returns RFC3339 like "2025-12-12T10:30:00Z"
+ try:
+ if not published_at:
+ return None
+ if published_at.endswith("Z"):
+ return datetime.fromisoformat(published_at.replace("Z", "+00:00"))
+ return datetime.fromisoformat(published_at)
+ except Exception:
+ return None
+
+
+def _is_english(snippet: dict) -> bool:
+ """
+ Best-effort check that the video is English.
+ Uses snippet.defaultAudioLanguage / snippet.defaultLanguage when available.
+ """
+ lang = (snippet or {}).get("defaultAudioLanguage") or (snippet or {}).get("defaultLanguage") or ""
+ lang = (lang or "").lower()
+ return lang.startswith("en")
+
+
+def _is_podcast_tagged(snippet: dict) -> bool:
+ """
+ Best-effort check that the video is tagged as a podcast.
+ - Prefer a dedicated category id if present (YouTube has a 'Podcasts' category in some locales)
+ - Otherwise fallback to tags/title/description containing 'podcast'
+ """
+ if not snippet:
+ return False
+
+ # Some YouTube environments use categoryId=43 for Podcasts; keep as a hint, not a guarantee.
+ if str(snippet.get("categoryId", "")) == "43":
+ return True
+
+ haystack = f"{snippet.get('title','')} {snippet.get('description','')}".lower()
+ if "podcast" in haystack or "#podcast" in haystack or "#podcasts" in haystack:
+ return True
+
+ tags = snippet.get("tags") or []
+ for t in tags:
+ if "podcast" in str(t).lower():
+ return True
+ return False
+
+
+def _fetch_video_details(video_ids: List[str], api_key: str) -> Dict[str, Dict[str, Any]]:
+ """
+ Fetch contentDetails/snippet for given video ids.
+ Returns a map: videoId -> item
+ """
+ if not video_ids:
+ return {}
+
+ url = "https://www.googleapis.com/youtube/v3/videos"
+ params = {
+ "part": "snippet,contentDetails",
+ "id": ",".join(video_ids),
+ "key": api_key,
+ # "hl" could be set, but not required
+ }
+ response = requests.get(url, params=params, timeout=10)
+ response.raise_for_status()
+ data = response.json()
+
+ out: Dict[str, Dict[str, Any]] = {}
+ for item in data.get("items", []):
+ out[item.get("id")] = item
+ return out
+
def search_youtube_videos(query, max_results=3):
    """
    Search YouTube (Data API v3) for recent, English, podcast-tagged,
    long-form videos matching `query`.

    Strict filters applied after the search:
      - published within the last 14 days
      - English (best-effort, from snippet language fields)
      - tagged/labelled as a podcast (best-effort)
      - at least 1 hour long (from videos.list contentDetails.duration)

    Args:
        query: Search query (the literal string " podcast" is appended).
        max_results: Maximum number of result dicts to return.

    Returns:
        List of video dicts (title/channel/video_id/url/thumbnail/...).
        Falls back to generate_sample_videos() when no API key is
        configured, a previous 403 disabled the key, or any request fails.
    """
    api_key = os.getenv('YOUTUBE_API_KEY')

    # Fallback: serve canned demo data when no YouTube API key is set.
    if not api_key:
        return generate_sample_videos(query, max_results)

    global _YT_API_DISABLED_REASON
    if _YT_API_DISABLED_REASON:
        # A previous call got a hard 403 — avoid spamming YouTube with
        # requests once we know the key is blocked for this process.
        return generate_sample_videos(query, max_results)

    try:
        # Only show recently uploaded: last 14 days.
        published_after = (datetime.now(timezone.utc) - timedelta(days=14)).strftime('%Y-%m-%dT%H:%M:%SZ')

        url = "https://www.googleapis.com/youtube/v3/search"
        # Fetch more than needed so we can filter down to the strict
        # constraints listed in the docstring. search.list caps maxResults
        # at 50, so clamp: previously max_results > 6 produced an invalid
        # value and the whole request failed with a 400.
        search_max = min(50, max(15, max_results * 8))
        params = {
            'part': 'snippet',
            'q': f'{query} podcast',
            'type': 'video',
            'order': 'viewCount',  # Most viewed first
            'publishedAfter': published_after,
            'maxResults': search_max,
            'key': api_key,
            'videoDuration': 'long',  # > 20 minutes (further filtered to >= 60 min below)
            'relevanceLanguage': 'en'
        }

        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        search_items = data.get("items", [])
        video_ids = [it.get("id", {}).get("videoId") for it in search_items if it.get("id", {}).get("videoId")]
        details_map = _fetch_video_details(video_ids, api_key)

        two_weeks_ago = datetime.now(timezone.utc) - timedelta(days=14)
        filtered = []

        for it in search_items:
            video_id = it.get("id", {}).get("videoId")
            if not video_id:
                continue

            details = details_map.get(video_id) or {}
            # Prefer the richer videos.list snippet; fall back to the search snippet.
            snippet = details.get("snippet") or it.get("snippet") or {}
            content = details.get("contentDetails") or {}

            # Constraint: published within the last two weeks.
            # NOTE(review): assumes publishedAt is timezone-aware (YouTube
            # returns RFC3339 with "Z"); a naive timestamp would make this
            # comparison raise and be swallowed by the generic handler below.
            published_at = _parse_published_at(snippet.get("publishedAt"))
            if not published_at or published_at < two_weeks_ago:
                continue

            if not _is_english(snippet):
                continue

            if not _is_podcast_tagged(snippet):
                continue

            # Constraint: at least one hour long.
            duration_seconds = _parse_duration_seconds(content.get("duration"))
            if duration_seconds < 3600:
                continue

            thumb = (snippet.get("thumbnails") or {}).get("high") or (snippet.get("thumbnails") or {}).get("default") or {}
            filtered.append({
                "title": snippet.get("title", ""),
                "channel": snippet.get("channelTitle", ""),
                "video_id": video_id,
                "url": f"https://www.youtube.com/watch?v={video_id}",
                "thumbnail": thumb.get("url", ""),
                # Keep description for now; UI may ignore it in compact cards.
                "description": (snippet.get("description", "")[:150] + "...") if len(snippet.get("description", "")) > 150 else snippet.get("description", ""),
                # Extra metadata (future use / debugging)
                "publishedAt": snippet.get("publishedAt"),
                "durationSeconds": duration_seconds,
                "defaultAudioLanguage": snippet.get("defaultAudioLanguage") or snippet.get("defaultLanguage"),
            })

            if len(filtered) >= max_results:
                break

        return filtered

    except requests.exceptions.HTTPError as e:
        resp = getattr(e, "response", None)
        status = getattr(resp, "status_code", None)
        details = _extract_youtube_error_reason(resp) if resp is not None else {"reason": "", "message": ""}

        reason = details.get("reason") or ""
        message = details.get("message") or ""

        # If we get a hard 403, disable further YouTube attempts for this process run.
        if status == 403:
            _YT_API_DISABLED_REASON = reason or "forbidden"

        hint = _hint_for_youtube_reason(reason)
        print(
            f"YouTube API HTTP error for '{query}': {status} {reason or ''} {message or ''}".strip()
        )
        print(f"Hint: {hint}")
        # Fallback to sample data
        return generate_sample_videos(query, max_results)
    except Exception as e:
        print(f"YouTube API error for '{query}': {str(e)}")
        # Fallback to sample data
        return generate_sample_videos(query, max_results)
+
+
+def generate_sample_videos(query, count=3):
+ """
+ Generate sample video data when YouTube API is not available
+ This provides a working demo even without API keys
+ """
+ # Map of sample videos for each category
+ print(f"Generating sample videos for '{query}'")
+ samples = {
+ "Latest in AI": [
+ {
+ 'title': 'The Future of AI: GPT-5 and Beyond',
+ 'channel': 'AI Explained',
+ 'video_id': 'dQw4w9WgXcQ',
+ 'url': 'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
+ 'thumbnail': 'https://img.youtube.com/vi/dQw4w9WgXcQ/hqdefault.jpg',
+ 'description': 'Podcast episode: Deep dive into the latest AI developments and what GPT-5 might bring to the table.',
+ # match strict filters (best-effort demo)
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=3)).isoformat(),
+ 'durationSeconds': 3900,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'AI Safety: Why It Matters Now More Than Ever',
+ 'channel': 'Tech Future Podcast',
+ 'video_id': 'oHg5SJYRHA0',
+ 'url': 'https://www.youtube.com/watch?v=oHg5SJYRHA0',
+ 'thumbnail': 'https://img.youtube.com/vi/oHg5SJYRHA0/hqdefault.jpg',
+ 'description': 'Podcast episode: Exploring the critical importance of AI safety research and alignment.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=5)).isoformat(),
+ 'durationSeconds': 4200,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Machine Learning Breakthroughs in 2024',
+ 'channel': 'ML Weekly',
+ 'video_id': 'jNQXAC9IVRw',
+ 'url': 'https://www.youtube.com/watch?v=jNQXAC9IVRw',
+ 'thumbnail': 'https://img.youtube.com/vi/jNQXAC9IVRw/hqdefault.jpg',
+ 'description': 'Podcast episode: A roundup of the most significant machine learning breakthroughs this year.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=9)).isoformat(),
+ 'durationSeconds': 3600,
+ 'defaultAudioLanguage': 'en'
+ }
+ ],
+ "Tech News": [
+ {
+ 'title': 'Apple Vision Pro: 6 Months Later',
+ 'channel': 'Tech Review Daily',
+ 'video_id': 'yPYZpwSpKmA',
+ 'url': 'https://www.youtube.com/watch?v=yPYZpwSpKmA',
+ 'thumbnail': 'https://img.youtube.com/vi/yPYZpwSpKmA/hqdefault.jpg',
+ 'description': 'Podcast episode: A comprehensive review after using Vision Pro for six months in real-world scenarios.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=2)).isoformat(),
+ 'durationSeconds': 3700,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'The State of Tech in 2025',
+ 'channel': 'Tech Trends',
+ 'video_id': '9bZkp7q19f0',
+ 'url': 'https://www.youtube.com/watch?v=9bZkp7q19f0',
+ 'thumbnail': 'https://img.youtube.com/vi/9bZkp7q19f0/hqdefault.jpg',
+ 'description': 'Podcast episode: Breaking down the biggest tech trends and what to expect in the coming year.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=7)).isoformat(),
+ 'durationSeconds': 5400,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Silicon Valley Startup Scene Update',
+ 'channel': 'Startup Stories',
+ 'video_id': 'pRpeEdMmmQ0',
+ 'url': 'https://www.youtube.com/watch?v=pRpeEdMmmQ0',
+ 'thumbnail': 'https://img.youtube.com/vi/pRpeEdMmmQ0/hqdefault.jpg',
+ 'description': 'Podcast episode: Latest news from Silicon Valley startups and venture capital trends.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=12)).isoformat(),
+ 'durationSeconds': 4000,
+ 'defaultAudioLanguage': 'en'
+ }
+ ],
+ "Movies": [
+ {
+ 'title': 'Top Movies of 2024: A Retrospective',
+ 'channel': 'Cinema Podcast',
+ 'video_id': 'kJQP7kiw5Fk',
+ 'url': 'https://www.youtube.com/watch?v=kJQP7kiw5Fk',
+ 'thumbnail': 'https://img.youtube.com/vi/kJQP7kiw5Fk/hqdefault.jpg',
+ 'description': 'Podcast episode: Reviewing the best films of 2024 and what made them special.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=4)).isoformat(),
+ 'durationSeconds': 3900,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Oscars 2025: Predictions and Analysis',
+ 'channel': 'Film Critics Roundtable',
+ 'video_id': 'lDK9QqIzhwk',
+ 'url': 'https://www.youtube.com/watch?v=lDK9QqIzhwk',
+ 'thumbnail': 'https://img.youtube.com/vi/lDK9QqIzhwk/hqdefault.jpg',
+ 'description': 'Podcast episode: Expert predictions for this year\'s Academy Awards and dark horse candidates.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=10)).isoformat(),
+ 'durationSeconds': 4200,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Behind the Scenes: Modern Filmmaking',
+ 'channel': 'Movie Insider',
+ 'video_id': 'CevxZvSJLk8',
+ 'url': 'https://www.youtube.com/watch?v=CevxZvSJLk8',
+ 'thumbnail': 'https://img.youtube.com/vi/CevxZvSJLk8/hqdefault.jpg',
+ 'description': 'Podcast episode: How technology is changing the way movies are made in Hollywood.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=1)).isoformat(),
+ 'durationSeconds': 3900,
+ 'defaultAudioLanguage': 'en'
+ }
+ ],
+ "Politics": [
+ {
+ 'title': '2024 Election Analysis: What It Means',
+ 'channel': 'Political Roundtable',
+ 'video_id': 'kffacxfA7G4',
+ 'url': 'https://www.youtube.com/watch?v=kffacxfA7G4',
+ 'thumbnail': 'https://img.youtube.com/vi/kffacxfA7G4/hqdefault.jpg',
+ 'description': 'Podcast episode: Expert analysis of the election results and implications for policy.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=6)).isoformat(),
+ 'durationSeconds': 4100,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Understanding Modern Political Movements',
+ 'channel': 'Policy Podcast',
+ 'video_id': 'fJ9rUzIMcZQ',
+ 'url': 'https://www.youtube.com/watch?v=fJ9rUzIMcZQ',
+ 'thumbnail': 'https://img.youtube.com/vi/fJ9rUzIMcZQ/hqdefault.jpg',
+ 'description': 'Podcast episode: Deep dive into the grassroots movements shaping politics today.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=8)).isoformat(),
+ 'durationSeconds': 3605,
+ 'defaultAudioLanguage': 'en'
+ },
+ {
+ 'title': 'Climate Policy: Progress and Challenges',
+ 'channel': 'The Policy Show',
+ 'video_id': 'WPni755-Krg',
+ 'url': 'https://www.youtube.com/watch?v=WPni755-Krg',
+ 'thumbnail': 'https://img.youtube.com/vi/WPni755-Krg/hqdefault.jpg',
+ 'description': 'Podcast episode: Examining recent climate legislation and what still needs to be done.',
+ 'publishedAt': (datetime.now(timezone.utc) - timedelta(days=11)).isoformat(),
+ 'durationSeconds': 4800,
+ 'defaultAudioLanguage': 'en'
+ }
+ ]
+ }
+
+ # Return sample videos for the query
+ for category, videos in samples.items():
+ if category.lower() in query.lower():
+ return videos[:count]
+
+ # Default fallback
+ return samples.get("Latest in AI", [])[:count]
+
+
+def get_trending_podcasts_route():
+ """
+ Flask route handler for /api/trending
+ Returns trending videos for all categories
+ """
+ try:
+ categories = [
+ {"id": "ai", "name": "Latest in AI", "icon": "π€"},
+ {"id": "tech", "name": "Tech News", "icon": "π»"},
+ {"id": "movies", "name": "Movies", "icon": "π¬"},
+ {"id": "politics", "name": "Politics", "icon": "π³οΈ"}
+ ]
+
+ results = {}
+
+ # Check if YouTube API key is available
+ has_youtube_api = bool(os.getenv('YOUTUBE_API_KEY'))
+
+ if not has_youtube_api:
+ print("β οΈ No YOUTUBE_API_KEY found - using sample data")
+
+ for category in categories:
+ print(f"π Searching trending videos for: {category['name']}")
+ videos = search_youtube_videos(category['name'], max_results=3)
+
+ results[category['id']] = {
+ 'name': category['name'],
+ 'icon': category['icon'],
+ 'videos': videos
+ }
+
+ print(f"β Found {len(videos)} videos for {category['name']}")
+
+ return jsonify({
+ 'success': True,
+ 'categories': results,
+ 'timestamp': datetime.now().isoformat(),
+ 'using_sample_data': not has_youtube_api
+ }), 200
+
+ except Exception as e:
+ print(f"β Error getting trending videos: {str(e)}")
+ import traceback
+ traceback.print_exc()
+ return jsonify({
+ 'success': False,
+ 'error': str(e)
+ }), 500
diff --git a/src/types/content.ts b/src/types/content.ts
new file mode 100644
index 000000000..80901bb64
--- /dev/null
+++ b/src/types/content.ts
@@ -0,0 +1,30 @@
+/** A podcast show as rendered in browse/category views. */
+export interface Show {
+  id: string;
+  title: string;
+  host: string;
+  thumbnail: string;
+  description: string;
+}
+
+/** A single extracted clip belonging to a Show. */
+export interface Clip {
+  id: string;
+  title: string;
+  showId: string;    // id of the parent Show
+  showTitle: string; // parent show title (duplicated here for display)
+  host: string;
+  thumbnail: string;
+  mediaUrl: string;
+  mediaType: "audio" | "video";
+  duration: number; // seconds
+  claim: string;    // claim associated with the clip
+  quote: string;    // quote associated with the clip
+  tags: string[];
+}
+
+/** A browsable category grouping shows and their clips. */
+export interface Category {
+  id: string;
+  name: string;
+  icon: string;
+  shows: Show[];
+  clips: Clip[];
+}
diff --git a/src/types/transcript.ts b/src/types/transcript.ts
new file mode 100644
index 000000000..e4f4a8046
--- /dev/null
+++ b/src/types/transcript.ts
@@ -0,0 +1,52 @@
+/**
+ * Transcript types for PodVibe.fm clip splicer
+ * Compatible with Google Speech-to-Text / Gemini output format
+ */
+
+export interface TranscriptSegment {
+ text: string;
+ start: number; // seconds
+ end: number; // seconds
+ speaker?: string;
+ confidence?: number;
+}
+
+export interface Transcript {
+ segments: TranscriptSegment[];
+ duration?: number; // total duration in seconds
+ language?: string;
+ source?: string; // e.g., "gemini", "whisper"
+}
+
+/** A start/end window, both in seconds. */
+export interface TimeRange {
+  start: number;
+  end: number;
+}
+
+/** Options for a single FFmpeg clip extraction. */
+export interface ClipOptions {
+  inputPath: string;
+  outputPath: string;
+  start: number; // seconds
+  end: number;   // seconds
+  codec?: 'copy' | 'reencode'; // 'copy' = fast stream copy; 'reencode' = frame-accurate
+  fadeIn?: number; // seconds
+  fadeOut?: number; // seconds
+  format?: 'mp3' | 'wav' | 'mp4' | 'aac';
+}
+
+/**
+ * Insight object for future LLM integration
+ * Matches the schema from idea-guy-overview
+ */
+export interface Insight {
+ claim: string;
+ why_it_matters: string;
+ evidence: string; // verbatim quote from transcript
+ who_said_it?: string;
+ timestamp_start: number;
+ timestamp_end: number;
+ tags?: string[];
+ actionability_score?: number;
+ novelty_score?: number;
+ clipability_score?: number;
+}
diff --git a/src/utils/clip-splicer.ts b/src/utils/clip-splicer.ts
new file mode 100644
index 000000000..416af3395
--- /dev/null
+++ b/src/utils/clip-splicer.ts
@@ -0,0 +1,554 @@
+#!/usr/bin/env bun
+/**
+ * FFmpeg Clip Splicer for PodVibe.fm
+ *
+ * Generates FFmpeg commands to extract audio/video clips based on
+ * transcript timestamps. Supports keyword search, speaker filtering,
+ * and direct timestamp ranges.
+ *
+ * Usage:
+ * bun run src/utils/clip-splicer.ts --help
+ */
+
+import { readFileSync, existsSync } from 'fs';
+import type { Transcript, TranscriptSegment, ClipOptions, Insight, TimeRange } from '../types/transcript';
+
+// ============================================================================
+// Transcript Parsing
+// ============================================================================
+
+export function parseTranscript(jsonPath: string): Transcript {
+  // Load and normalize a transcript JSON file. Three shapes are accepted:
+  // a bare array of segments, an object carrying a `segments` array, and
+  // Google Speech-to-Text-like objects exposing `results` or `utterances`.
+  if (!existsSync(jsonPath)) {
+    throw new Error(`Transcript file not found: ${jsonPath}`);
+  }
+
+  const parsed = JSON.parse(readFileSync(jsonPath, 'utf-8'));
+
+  if (Array.isArray(parsed)) {
+    // Direct array of segments
+    return { segments: normalizeSegments(parsed) };
+  }
+  if (parsed.segments) {
+    // Object with segments property
+    return { ...parsed, segments: normalizeSegments(parsed.segments) };
+  }
+  const sttSegments = parsed.results || parsed.utterances;
+  if (sttSegments) {
+    // Google Speech-to-Text or similar format
+    return { segments: normalizeSegments(sttSegments) };
+  }
+
+  throw new Error('Unrecognized transcript format');
+}
+
+/**
+ * Map heterogeneous segment dialects (text/transcript/content,
+ * start/start_time/startTime, ...) onto TranscriptSegment.
+ *
+ * Time fields use nullish coalescing (not ||): a legitimate start of 0
+ * seconds is falsy and must not be overridden by an alternate key.
+ */
+function normalizeSegments(segments: unknown[]): TranscriptSegment[] {
+  return segments.map((seg: any) => ({
+    text: seg.text || seg.transcript || seg.content || '',
+    start: parseTime(seg.start ?? seg.start_time ?? seg.startTime ?? 0),
+    end: parseTime(seg.end ?? seg.end_time ?? seg.endTime ?? 0),
+    speaker: seg.speaker || seg.speaker_label || seg.speakerId,
+    confidence: seg.confidence,
+  }));
+}
+
+/**
+ * Normalize a timestamp to seconds.
+ * Accepts plain numbers, "90.5", "90.5s", "1:30.5" (m:s) and
+ * "1:02:03.5" (h:m:s). Unparseable input yields 0 — consistent with the
+ * fallback for unknown types — instead of letting NaN leak into clip math.
+ */
+function parseTime(time: number | string): number {
+  if (typeof time === 'number') {
+    return Number.isFinite(time) ? time : 0;
+  }
+  if (typeof time === 'string') {
+    if (time.endsWith('s')) {
+      const v = parseFloat(time.slice(0, -1));
+      return Number.isNaN(v) ? 0 : v;
+    }
+    if (time.includes(':')) {
+      const parts = time.split(':').map((p) => parseFloat(p));
+      if (!parts.some(Number.isNaN)) {
+        if (parts.length === 2) {
+          return parts[0] * 60 + parts[1];
+        } else if (parts.length === 3) {
+          return parts[0] * 3600 + parts[1] * 60 + parts[2];
+        }
+      }
+    }
+    const v = parseFloat(time);
+    return Number.isNaN(v) ? 0 : v;
+  }
+  return 0;
+}
+
+// ============================================================================
+// Segment Search Functions
+// ============================================================================
+
+export function findSegmentsByKeyword(
+  transcript: Transcript,
+  keywords: string[],
+  caseSensitive = false
+): TranscriptSegment[] {
+  // Normalize the needles once up front; a segment is kept when any
+  // keyword occurs as a substring of its text.
+  const needles = caseSensitive ? keywords : keywords.map((k) => k.toLowerCase());
+
+  const matches = (segment: TranscriptSegment): boolean => {
+    const haystack = caseSensitive ? segment.text : segment.text.toLowerCase();
+    return needles.some((needle) => haystack.includes(needle));
+  };
+
+  return transcript.segments.filter(matches);
+}
+
+export function findSegmentsBySpeaker(
+  transcript: Transcript,
+  speaker: string
+): TranscriptSegment[] {
+  // Case-insensitive match on the segment's speaker label; segments with
+  // no speaker never match.
+  const wanted = speaker.toLowerCase();
+  const hits: TranscriptSegment[] = [];
+  for (const segment of transcript.segments) {
+    if (segment.speaker?.toLowerCase() === wanted) {
+      hits.push(segment);
+    }
+  }
+  return hits;
+}
+
+export function findSegmentsByTimeRange(
+  transcript: Transcript,
+  start: number,
+  end: number
+): TranscriptSegment[] {
+  // Keep only segments fully contained within [start, end].
+  const contained = (s: TranscriptSegment): boolean =>
+    s.start >= start && s.end <= end;
+  return transcript.segments.filter(contained);
+}
+
+// ============================================================================
+// Boundary Expansion
+// ============================================================================
+
+/**
+ * Compute a single time range covering all selected segments, optionally
+ * widened to the enclosing speaker turns and padded on both sides.
+ *
+ * @param selectedSegments - Segments the clip must include (non-empty).
+ * @param allSegments - Full ordered segment list the selection came from.
+ * @param options.paddingSeconds - Extra seconds added before/after (default 0.5).
+ * @param options.expandToSpeakerTurn - Walk outward while the same speaker
+ *   continues, so clips start/end on turn boundaries (default true).
+ * @returns The expanded { start, end } range; start is clamped at 0.
+ * @throws Error when selectedSegments is empty.
+ */
+export function expandToNaturalBoundaries(
+  selectedSegments: TranscriptSegment[],
+  allSegments: TranscriptSegment[],
+  options: { paddingSeconds?: number; expandToSpeakerTurn?: boolean } = {}
+): TimeRange {
+  const { paddingSeconds = 0.5, expandToSpeakerTurn = true } = options;
+
+  if (selectedSegments.length === 0) {
+    throw new Error('No segments selected');
+  }
+
+  // Tight bounds over the selection before any expansion.
+  let start = Math.min(...selectedSegments.map(s => s.start));
+  let end = Math.max(...selectedSegments.map(s => s.end));
+
+  if (expandToSpeakerTurn) {
+    // Find the speaker of the first selected segment
+    const firstSpeaker = selectedSegments[0].speaker;
+    const lastSpeaker = selectedSegments[selectedSegments.length - 1].speaker;
+
+    // Expand start to beginning of speaker's turn
+    // NOTE(review): segments are located by exact start/end equality; if the
+    // selection's timestamps were modified, findIndex returns -1 and the
+    // walk loops simply don't run (no expansion).
+    if (firstSpeaker) {
+      const startIdx = allSegments.findIndex(s => s.start === selectedSegments[0].start);
+      // Walk backwards while the same speaker keeps talking.
+      for (let i = startIdx - 1; i >= 0; i--) {
+        if (allSegments[i].speaker === firstSpeaker) {
+          start = allSegments[i].start;
+        } else {
+          break;
+        }
+      }
+    }
+
+    // Expand end to end of speaker's turn
+    if (lastSpeaker) {
+      const endIdx = allSegments.findIndex(
+        s => s.end === selectedSegments[selectedSegments.length - 1].end
+      );
+      // Walk forwards while the same speaker keeps talking.
+      for (let i = endIdx + 1; i < allSegments.length; i++) {
+        if (allSegments[i].speaker === lastSpeaker) {
+          end = allSegments[i].end;
+        } else {
+          break;
+        }
+      }
+    }
+  }
+
+  // Apply padding; start never goes negative.
+  start = Math.max(0, start - paddingSeconds);
+  end = end + paddingSeconds;
+
+  return { start, end };
+}
+
+/**
+ * Merge ranges that overlap or lie within gapThreshold seconds of each
+ * other, returning new TimeRange objects sorted by start.
+ *
+ * Works on copies: the previous version pushed the caller's objects into
+ * the result and then wrote `last.end = ...` through the shared
+ * references, mutating the input ranges as a side effect.
+ */
+export function mergeOverlappingRanges(ranges: TimeRange[], gapThreshold = 2): TimeRange[] {
+  if (ranges.length === 0) return [];
+
+  const sorted = [...ranges].sort((a, b) => a.start - b.start);
+  const merged: TimeRange[] = [{ ...sorted[0] }];
+
+  for (let i = 1; i < sorted.length; i++) {
+    const current = sorted[i];
+    const last = merged[merged.length - 1];
+
+    if (current.start <= last.end + gapThreshold) {
+      // Close enough: extend the accumulated range (a local copy).
+      last.end = Math.max(last.end, current.end);
+    } else {
+      merged.push({ ...current });
+    }
+  }
+
+  return merged;
+}
+
+// ============================================================================
+// FFmpeg Command Generation
+// ============================================================================
+
+export function formatTimestamp(seconds: number): string {
+  // Render seconds as "m:ss.mmm", switching to "h:mm:ss.mmm" from one
+  // hour upward (minutes are only zero-padded in the hour form).
+  const hours = Math.floor(seconds / 3600);
+  const minutes = Math.floor((seconds % 3600) / 60);
+  const secs = (seconds % 60).toFixed(3).padStart(6, '0');
+
+  return hours > 0
+    ? `${hours}:${minutes.toString().padStart(2, '0')}:${secs}`
+    : `${minutes}:${secs}`;
+}
+
+/**
+ * Build a single ffmpeg invocation that extracts [start, end] from
+ * inputPath into outputPath.
+ *
+ * codec 'copy' seeks before -i (fast, keyframe-aligned) and therefore must
+ * express the clip length as a relative -t duration: input seeking resets
+ * timestamps, so the previous absolute `-to end` produced a clip of length
+ * `end` instead of `end - start`. codec 'reencode' seeks after -i on the
+ * original timeline, where an absolute -to is correct.
+ */
+export function generateFFmpegCommand(options: ClipOptions): string {
+  const {
+    inputPath,
+    outputPath,
+    start,
+    end,
+    codec = 'copy',
+    fadeIn,
+    fadeOut,
+  } = options;
+
+  const duration = end - start;
+  const parts: string[] = ['ffmpeg', '-y'];
+
+  if (codec === 'copy') {
+    // Input seeking (before -i): fast, timestamps restart near 0, so the
+    // clip length must be given relative to the seek point.
+    parts.push(`-ss ${formatTimestamp(start)}`);
+    parts.push(`-i "${inputPath}"`);
+    parts.push(`-t ${formatTimestamp(duration)}`);
+  } else {
+    // Output seeking (after -i): frame-accurate, original timeline kept.
+    parts.push(`-i "${inputPath}"`);
+    parts.push(`-ss ${formatTimestamp(start)}`);
+    parts.push(`-to ${formatTimestamp(end)}`);
+  }
+
+  // Audio filters for fade effects.
+  // NOTE(review): ffmpeg rejects -af combined with -c copy; fades only take
+  // effect when codec === 'reencode' — confirm callers pair them that way.
+  if (fadeIn || fadeOut) {
+    const filters: string[] = [];
+    if (fadeIn) {
+      filters.push(`afade=t=in:st=0:d=${fadeIn}`);
+    }
+    if (fadeOut) {
+      const fadeOutStart = duration - fadeOut;
+      filters.push(`afade=t=out:st=${fadeOutStart}:d=${fadeOut}`);
+    }
+    parts.push(`-af "${filters.join(',')}"`);
+  }
+
+  // Codec settings
+  if (codec === 'copy') {
+    parts.push('-c copy');
+  } else {
+    // Re-encode with good quality, chosen from the output extension.
+    const ext = outputPath.split('.').pop()?.toLowerCase();
+    if (ext === 'mp3') {
+      parts.push('-c:a libmp3lame -q:a 2');
+    } else if (ext === 'aac' || ext === 'm4a') {
+      parts.push('-c:a aac -b:a 192k');
+    } else if (ext === 'wav') {
+      parts.push('-c:a pcm_s16le');
+    } else if (ext === 'mp4') {
+      parts.push('-c:v copy -c:a aac -b:a 192k');
+    }
+  }
+
+  parts.push(`"${outputPath}"`);
+
+  return parts.join(' ');
+}
+
+/**
+ * Generate one ffmpeg command per range. `%d` in outputPattern is replaced
+ * by a zero-padded 1-based clip index (e.g. "clip_%d.mp3" -> "clip_001.mp3").
+ *
+ * Fix: the options parameter was a bare `Partial` (missing its type
+ * argument), which is a TypeScript compile error.
+ */
+export function generateMultiClipCommand(
+  inputPath: string,
+  ranges: TimeRange[],
+  outputPattern: string,
+  options: Partial<ClipOptions> = {}
+): string[] {
+  return ranges.map((range, i) => {
+    const outputPath = outputPattern.replace('%d', String(i + 1).padStart(3, '0'));
+    return generateFFmpegCommand({
+      inputPath,
+      outputPath,
+      start: range.start,
+      end: range.end,
+      ...options,
+    });
+  });
+}
+
+// ============================================================================
+// Insight Integration (for future LLM use)
+// ============================================================================
+
+/**
+ * Build an ffmpeg command for one LLM-produced Insight, clipping
+ * [timestamp_start, timestamp_end] from mediaPath. When no outputPath is
+ * supplied, the input filename gets a "_clip_<start>s" mp3 suffix.
+ *
+ * Fix: `Partial` was missing its type argument (TypeScript compile error).
+ */
+export function clipFromInsight(
+  mediaPath: string,
+  insight: Insight,
+  options: Partial<ClipOptions> = {}
+): string {
+  const outputPath = options.outputPath ||
+    mediaPath.replace(/\.[^.]+$/, `_clip_${insight.timestamp_start.toFixed(0)}.mp3`);
+
+  return generateFFmpegCommand({
+    inputPath: mediaPath,
+    outputPath,
+    start: insight.timestamp_start,
+    end: insight.timestamp_end,
+    ...options,
+  });
+}
+
+/**
+ * Build one ffmpeg command per Insight, writing numbered clips into
+ * outputDir.
+ *
+ * Fixes: the computed `filename` was never used — the output path was the
+ * literal string "$(unknown)" — and `Partial` lacked its type argument.
+ */
+export function clipsFromInsights(
+  mediaPath: string,
+  insights: Insight[],
+  outputDir: string,
+  options: Partial<ClipOptions> = {}
+): string[] {
+  return insights.map((insight, i) => {
+    const filename = `clip_${String(i + 1).padStart(3, '0')}_${insight.timestamp_start.toFixed(0)}s.mp3`;
+    const outputPath = `${outputDir}/${filename}`;
+    return clipFromInsight(mediaPath, insight, { ...options, outputPath });
+  });
+}
+
+// ============================================================================
+// CLI
+// ============================================================================
+
+interface CliArgs {
+ input?: string;
+ transcript?: string;
+ output?: string;
+ start?: number;
+ end?: number;
+ keywords?: string[];
+ speaker?: string;
+ padding?: number;
+ codec?: 'copy' | 'reencode';
+ fadeIn?: number;
+ fadeOut?: number;
+ dryRun?: boolean;
+ help?: boolean;
+}
+
+function parseArgs(args: string[]): CliArgs {
+ const result: CliArgs = {};
+
+ for (let i = 0; i < args.length; i++) {
+ const arg = args[i];
+ const next = args[i + 1];
+
+ switch (arg) {
+ case '--input':
+ case '-i':
+ result.input = next;
+ i++;
+ break;
+ case '--transcript':
+ case '-t':
+ result.transcript = next;
+ i++;
+ break;
+ case '--output':
+ case '-o':
+ result.output = next;
+ i++;
+ break;
+ case '--start':
+ case '-s':
+ result.start = parseFloat(next);
+ i++;
+ break;
+ case '--end':
+ case '-e':
+ result.end = parseFloat(next);
+ i++;
+ break;
+ case '--keywords':
+ case '-k':
+ result.keywords = next.split(',').map(k => k.trim());
+ i++;
+ break;
+ case '--speaker':
+ result.speaker = next;
+ i++;
+ break;
+ case '--padding':
+ case '-p':
+ result.padding = parseFloat(next);
+ i++;
+ break;
+ case '--codec':
+ case '-c':
+ result.codec = next as 'copy' | 'reencode';
+ i++;
+ break;
+ case '--fade-in':
+ result.fadeIn = parseFloat(next);
+ i++;
+ break;
+ case '--fade-out':
+ result.fadeOut = parseFloat(next);
+ i++;
+ break;
+ case '--dry-run':
+ case '-n':
+ result.dryRun = true;
+ break;
+ case '--help':
+ case '-h':
+ result.help = true;
+ break;
+ }
+ }
+
+ return result;
+}
+
+function printHelp(): void {
+ console.log(`
+FFmpeg Clip Splicer - Extract audio/video clips from transcripts
+
+USAGE:
+ bun run src/utils/clip-splicer.ts [OPTIONS]
+
+OPTIONS:
+ -i, --input Input media file (required)
+ -t, --transcript Transcript JSON file (required for keyword/speaker search)
+ -o, --output Output file path (default: input_clip.mp3)
+
+ SELECTION (choose one):
+ -s, --start Start time in seconds
+ -e, --end End time in seconds
+ -k, --keywords Comma-separated keywords to search for
+ --speaker Filter by speaker name
+
+ PROCESSING:
+ -p, --padding Add padding around clip boundaries (default: 0.5)
+ -c, --codec 'copy' (fast) or 'reencode' (precise) (default: copy)
+ --fade-in Fade in duration
+ --fade-out Fade out duration
+
+ OUTPUT:
+ -n, --dry-run Print FFmpeg command without executing
+ -h, --help Show this help message
+
+EXAMPLES:
+ # Extract by timestamp range
+ bun run src/utils/clip-splicer.ts -i podcast.mp3 -s 120.5 -e 185.2 -o clip.mp3
+
+ # Extract by keyword search
+ bun run src/utils/clip-splicer.ts -i podcast.mp3 -t transcript.json -k "AI,machine learning" -o clip.mp3
+
+ # Extract by speaker with fade
+ bun run src/utils/clip-splicer.ts -i podcast.mp3 -t transcript.json --speaker "Speaker 1" --fade-in 0.5 --fade-out 0.5
+
+ # Dry run (print command only)
+ bun run src/utils/clip-splicer.ts -i podcast.mp3 -s 120 -e 180 --dry-run
+`);
+}
+
+/**
+ * CLI entry point: resolve a clip range (explicit --start/--end, or a
+ * keyword/speaker search over a transcript), build the ffmpeg command,
+ * and either print it (--dry-run) or execute it via Bun.spawn.
+ *
+ * Fix: the return type was a bare `Promise` (missing its type argument),
+ * which is a TypeScript compile error.
+ */
+async function main(): Promise<void> {
+  const args = parseArgs(process.argv.slice(2));
+
+  if (args.help || process.argv.length <= 2) {
+    printHelp();
+    process.exit(0);
+  }
+
+  if (!args.input) {
+    console.error('Error: --input is required');
+    process.exit(1);
+  }
+
+  let start: number;
+  let end: number;
+
+  // Determine clip boundaries
+  if (args.start !== undefined && args.end !== undefined) {
+    // Direct timestamp mode
+    start = args.start;
+    end = args.end;
+  } else if (args.transcript && (args.keywords || args.speaker)) {
+    // Search mode: locate matching segments, then widen to natural
+    // boundaries (speaker turns + padding).
+    const transcript = parseTranscript(args.transcript);
+    let segments: TranscriptSegment[];
+
+    if (args.keywords) {
+      segments = findSegmentsByKeyword(transcript, args.keywords);
+      if (segments.length === 0) {
+        console.error(`No segments found matching keywords: ${args.keywords.join(', ')}`);
+        process.exit(1);
+      }
+      console.log(`Found ${segments.length} segment(s) matching keywords`);
+    } else if (args.speaker) {
+      segments = findSegmentsBySpeaker(transcript, args.speaker);
+      if (segments.length === 0) {
+        console.error(`No segments found for speaker: ${args.speaker}`);
+        process.exit(1);
+      }
+      console.log(`Found ${segments.length} segment(s) from speaker: ${args.speaker}`);
+    } else {
+      console.error('Error: Must specify --start/--end, --keywords, or --speaker');
+      process.exit(1);
+    }
+
+    const range = expandToNaturalBoundaries(segments, transcript.segments, {
+      paddingSeconds: args.padding ?? 0.5,
+    });
+    start = range.start;
+    end = range.end;
+
+    console.log(`Expanded to time range: ${formatTimestamp(start)} - ${formatTimestamp(end)}`);
+  } else {
+    console.error('Error: Must specify --start/--end, or --transcript with --keywords/--speaker');
+    process.exit(1);
+  }
+
+  // Generate output path if not specified
+  const output = args.output || args.input.replace(/\.[^.]+$/, '_clip.mp3');
+
+  // Generate FFmpeg command
+  const command = generateFFmpegCommand({
+    inputPath: args.input,
+    outputPath: output,
+    start,
+    end,
+    codec: args.codec,
+    fadeIn: args.fadeIn,
+    fadeOut: args.fadeOut,
+  });
+
+  if (args.dryRun) {
+    console.log('\nFFmpeg command:');
+    console.log(command);
+  } else {
+    console.log(`\nExecuting: ${command}\n`);
+    const proc = Bun.spawn(['sh', '-c', command], {
+      stdout: 'inherit',
+      stderr: 'inherit',
+    });
+    const exitCode = await proc.exited;
+    if (exitCode !== 0) {
+      console.error(`FFmpeg exited with code ${exitCode}`);
+      process.exit(exitCode);
+    }
+    console.log(`\nClip saved to: ${output}`);
+  }
+}
+
+// Run CLI if this is the main module
+if (import.meta.main) {
+ main().catch(err => {
+ console.error('Error:', err.message);
+ process.exit(1);
+ });
+}
diff --git a/src/youtube_summarizer.py b/src/youtube_summarizer.py
new file mode 100644
index 000000000..55a08e811
--- /dev/null
+++ b/src/youtube_summarizer.py
@@ -0,0 +1,311 @@
+"""
+PodVibe.fm - AI-Powered Podcast Summarizer
+This agent fetches YouTube video transcripts and generates summaries using Gemini API
+
+Architecture:
+- Planner: Breaks down summarization into sub-tasks (ReAct pattern)
+- Executor: Executes tasks using external tools (YouTube API, Gemini API)
+- Memory: Logs all agent activities for full observability
+"""
+
+import os
+from typing import Dict, List, Optional
+import json
+from datetime import datetime
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Import modular components
+from planner import Planner
+from executor import Executor
+from memory import Memory
+
+
+class YouTubeSummarizer:
+ """
+ Main Agentic AI for summarizing YouTube podcasts
+
+ Architecture Components:
+ - Planner: Creates execution plans by breaking down tasks
+ - Executor: Executes tasks using external tools and APIs
+ - Memory: Tracks all agent activities and decisions
+ """
+
+ def __init__(self, api_key: Optional[str] = None, persist_memory: bool = False):
+ """
+ Initialize the YouTube Summarizer Agent
+
+ Args:
+ api_key: Google Gemini API key (if not provided, uses GEMINI_API_KEY env var)
+ persist_memory: Whether to save memory logs to disk
+ """
+ self.api_key = api_key or os.getenv('GEMINI_API_KEY')
+ if not self.api_key:
+ raise ValueError("Gemini API key is required. Set GEMINI_API_KEY environment variable or pass api_key parameter")
+
+ # Initialize modular components
+ self.planner = Planner()
+ self.executor = Executor(api_key=self.api_key)
+ self.memory = Memory(persist_to_file=persist_memory)
+
+ def extract_video_id(self, url: str) -> str:
+ """
+ Extract video ID from YouTube URL (delegates to executor)
+
+ Args:
+ url: YouTube video URL
+
+ Returns:
+ Video ID string
+ """
+ task = {'action': 'extract_video_id', 'tool': 'url_parser'}
+ context = {'url': url}
+ result = self.executor.execute_task(task, context)
+
+ if result['status'] == 'success':
+ return result['result']['video_id']
+ else:
+ raise Exception(f"Failed to extract video ID: {result.get('error')}")
+
+ def get_transcript(self, video_id: str) -> str:
+ """
+ Fetch transcript from YouTube video (delegates to executor)
+
+ Args:
+ video_id: YouTube video ID
+
+ Returns:
+ Full transcript text
+ """
+ task = {'action': 'fetch_transcript', 'tool': 'youtube_api'}
+ context = {'video_id': video_id}
+ result = self.executor.execute_task(task, context)
+
+ if result['status'] == 'success':
+ return result['result']['transcript']
+ else:
+ raise Exception(f"Failed to fetch transcript: {result.get('error')}")
+
+ def summarize_text(self, text: str, summary_type: str = 'comprehensive') -> str:
+ """
+ Generate summary using Gemini API (delegates to executor)
+
+ Args:
+ text: Text to summarize
+ summary_type: Type of summary ('comprehensive', 'brief', 'key_points')
+
+ Returns:
+ Summary text
+ """
+ task = {'action': 'generate_summary', 'tool': 'gemini_api'}
+ context = {'transcript': text, 'summary_type': summary_type}
+ result = self.executor.execute_task(task, context)
+
+ if result['status'] == 'success':
+ return result['result']['summary']
+ else:
+ raise Exception(f"Failed to generate summary: {result.get('error')}")
+
+ def process_youtube_url(self, url: str, summary_type: str = 'comprehensive') -> Dict:
+ """
+ Main Agentic Workflow: Process YouTube URL and generate summary
+
+ This method demonstrates the full ReAct (Reasoning + Acting) pattern:
+ 1. PLAN: Create execution plan with sub-tasks
+ 2. ACT: Execute each task using appropriate tools
+ 3. OBSERVE: Log all actions to memory
+ 4. RESPOND: Return final result
+
+ Args:
+ url: YouTube video URL
+ summary_type: Type of summary to generate
+
+ Returns:
+ Dictionary containing video_id, transcript, and summary
+ """
+ print(f"π¬ Processing YouTube URL: {url}")
+
+ # Log user input to memory
+ user_input = {'url': url, 'summary_type': summary_type}
+ self.memory.log_user_input(user_input)
+
+ # STEP 1: PLANNING - Create execution plan
+ print("\nπ§ Planning: Creating execution plan...")
+ plan = self.planner.create_plan(user_input)
+ self.memory.log_plan_creation(plan)
+ print(f"β Plan created with {len(plan)} tasks")
+
+ # STEP 2: EXECUTION - Execute plan step by step
+ print("\nοΏ½ Execution: Running planned tasks...\n")
+
+ execution_context = user_input.copy()
+
+ while not self.planner.is_plan_complete(plan):
+ # Get next task
+ task = self.planner.get_next_task(plan)
+ if not task:
+ break
+
+ print(f"π Task {task['step']}: {task['description']}")
+
+ # Log task start
+ self.memory.log_task_start(task)
+
+ # Update task status to in_progress
+ plan = self.planner.update_task_status(plan, task['step'], 'in_progress')
+
+ try:
+ # Execute task using executor
+ result = self.executor.execute_task(task, execution_context)
+
+ if result['status'] == 'success':
+ # Update plan with success
+ plan = self.planner.update_task_status(plan, task['step'], 'completed', result)
+
+ # Update execution context with result data
+ execution_context.update(result.get('result', {}))
+
+ # Log success
+ self.memory.log_task_completion(task, result)
+ print(f"β Completed: {task['action']}\n")
+ else:
+ # Task failed
+ plan = self.planner.update_task_status(plan, task['step'], 'failed', result)
+ self.memory.log_task_failure(task, result.get('error', 'Unknown error'))
+ raise Exception(f"Task failed: {result.get('error')}")
+
+ except Exception as e:
+ self.memory.log_task_failure(task, str(e))
+ raise
+
+ # STEP 3: FINALIZE - Prepare final result
+ print("π Finalizing results...")
+
+ final_result = {
+ 'video_id': execution_context.get('video_id'),
+ 'url': url,
+ 'transcript': execution_context.get('transcript'),
+ 'transcript_length': len(execution_context.get('transcript', '')),
+ 'segments': execution_context.get('segments', 0),
+ 'summary': execution_context.get('summary'),
+ 'keywords': execution_context.get('keywords', []),
+ 'summary_type': summary_type,
+ 'timestamp': datetime.now().isoformat(),
+ 'plan_summary': self.planner.get_plan_summary(plan)
+ }
+
+ # Log final result
+ self.memory.log_final_result(final_result)
+
+ print("β Processing complete!\n")
+
+ return final_result
+
+ def save_summary(self, result: Dict, output_file: str):
+ """
+ Save summary to file
+
+ Args:
+ result: Summary result dictionary
+ output_file: Path to output file
+ """
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(result, f, indent=2, ensure_ascii=False)
+ print(f"πΎ Summary saved to: {output_file}")
+
+ def get_memory_log(self) -> List[Dict]:
+ """
+ Get agent's memory/activity log for full observability
+
+ Returns:
+ List of memory entries showing all agent decisions
+ """
+ return self.memory.get_memory()
+
+ def get_session_summary(self) -> Dict:
+ """
+ Get summary of the current session
+
+ Returns:
+ Dictionary with session statistics
+ """
+ return self.memory.get_session_summary()
+
+ def get_execution_timeline(self) -> List[Dict]:
+ """
+ Get chronological timeline of task execution
+
+ Returns:
+ Ordered list of execution events
+ """
+ return self.memory.get_execution_timeline()
+
+ def export_memory(self, filepath: str = None) -> str:
+ """
+ Export memory logs to file
+
+ Args:
+ filepath: Optional custom filepath
+
+ Returns:
+ Path to exported file
+ """
+ return self.memory.export_memory(filepath)
+
+
+def main():
+ """Demo usage"""
+ import sys
+
+ # Check for API key
+ if not os.getenv('GEMINI_API_KEY'):
+ print("β Error: GEMINI_API_KEY environment variable not set")
+ print("Please set it using: $env:GEMINI_API_KEY='your-api-key'")
+ sys.exit(1)
+
+ # Example usage
+ summarizer = YouTubeSummarizer()
+
+ # Example YouTube URL (replace with actual podcast URL)
+ youtube_url = input("Enter YouTube podcast URL: ")
+
+ if not youtube_url.strip():
+ print("Using example URL for demonstration...")
+ youtube_url = "https://www.youtube.com/watch?v=aR20FWCCjAs"
+
+ try:
+ # Process the video
+ result = summarizer.process_youtube_url(youtube_url, summary_type='comprehensive')
+
+ # Display summary
+ print("\n" + "="*80)
+ print("π SUMMARY")
+ print("="*80)
+ print(result['summary'])
+ print("="*80)
+
+ # Save to file
+ output_file = f"summary_{result['video_id']}.json"
+ summarizer.save_summary(result, output_file)
+
+ # Show memory log
+ print("\nπ Agent Memory Log:")
+ for entry in summarizer.get_memory_log():
+ event_type = entry.get('event_type', 'unknown')
+ details = entry.get('details', {})
+ if event_type == 'task_start':
+ print(f" - {entry['timestamp']}: Started task: {details.get('task_name', 'N/A')}")
+ elif event_type == 'task_complete':
+ print(f" - {entry['timestamp']}: Completed task: {details.get('task_name', 'N/A')}")
+ else:
+ print(f" - {entry['timestamp']}: {event_type}")
+
+ except Exception as e:
+ print(f"\nβ Error: {str(e)}")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/start.sh b/start.sh
new file mode 100755
index 000000000..bd12db054
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# Quick start script - Sets up and runs both frontend and backend.
+# Launches each service in the background, verifies it survived startup,
+# then waits; Ctrl+C tears both down via the trap below.
+
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo "β β"
+echo "β π PodVibe.fm - Quick Start β"
+echo "β β"
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+
+# Check for .env file
+if [ ! -f "src/.env" ]; then
+    echo "β οΈ No .env file found!"
+    echo ""
+    echo "You need to set up your API keys first:"
+    echo " 1. Run: ./setup_youtube_api.sh"
+    echo " OR"
+    echo " 2. Manually create src/.env with:"
+    echo " GEMINI_API_KEY=your_key"
+    echo " YOUTUBE_API_KEY=your_key"
+    echo ""
+    exit 1
+fi
+
+# Load environment variables (best effort; a malformed .env is ignored)
+source src/.env 2>/dev/null || true
+
+echo "β Configuration found"
+echo ""
+echo "Starting services..."
+echo ""
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+echo "This will start:"
+echo " β’ Backend API on http://localhost:8000"
+echo " β’ Frontend UI on http://localhost:3000"
+echo ""
+echo "Press Ctrl+C to stop both services"
+echo ""
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+
+# Function to cleanup on exit
+cleanup() {
+    echo ""
+    echo "Stopping services..."
+    kill $BACKEND_PID $FRONTEND_PID 2>/dev/null
+    exit 0
+}
+
+trap cleanup INT TERM
+
+# Start backend (re-export so child processes inherit the keys)
+echo "π§ Starting backend..."
+cd src
+export GEMINI_API_KEY=$GEMINI_API_KEY
+export YOUTUBE_API_KEY=$YOUTUBE_API_KEY
+python3 api.py > ../backend.log 2>&1 &
+BACKEND_PID=$!
+cd ..
+
+sleep 3
+
+# Fail fast if the backend crashed during startup instead of reporting success.
+if ! kill -0 "$BACKEND_PID" 2>/dev/null; then
+    echo "β Backend failed to start - check backend.log"
+    exit 1
+fi
+
+# Start frontend
+echo "π¨ Starting frontend..."
+cd frontend
+npm run dev > ../frontend.log 2>&1 &
+FRONTEND_PID=$!
+cd ..
+
+sleep 3
+
+# Fail fast if the frontend crashed, and take the backend down with it.
+if ! kill -0 "$FRONTEND_PID" 2>/dev/null; then
+    echo "β Frontend failed to start - check frontend.log"
+    kill $BACKEND_PID 2>/dev/null
+    exit 1
+fi
+
+echo ""
+echo "β Services started!"
+echo ""
+echo "π‘ Backend: http://localhost:8000"
+echo "π Frontend: http://localhost:3000"
+echo ""
+echo "Logs:"
+echo " Backend: tail -f backend.log"
+echo " Frontend: tail -f frontend.log"
+echo ""
+echo "Press Ctrl+C to stop"
+echo ""
+
+# Block until both background services exit (or the trap fires on Ctrl+C)
+wait
+
diff --git a/test_youtube_api.sh b/test_youtube_api.sh
new file mode 100755
index 000000000..5a3d5caa3
--- /dev/null
+++ b/test_youtube_api.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+# Test YouTube API Integration
+# Runs four checks: .env presence, API keys, Python deps, live endpoint.
+# Exit status now reflects the real outcome instead of always claiming success.
+
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo "β β"
+echo "β π§ͺ Testing YouTube API Integration β"
+echo "β β"
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+
+# Track whether any non-fatal check failed so the final summary is honest.
+FAILED=0
+
+# Test 1: Check .env file exists
+echo "1οΈβ£ Checking configuration..."
+if [ ! -f "src/.env" ]; then
+    echo " β No .env file found in src/"
+    echo " Run: ./setup_youtube_api.sh"
+    exit 1
+else
+    echo " β .env file exists"
+fi
+
+# Test 2: Check environment variables
+echo ""
+echo "2οΈβ£ Checking API keys..."
+cd src
+python3 << PYEOF
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+gemini_key = os.getenv('GEMINI_API_KEY')
+youtube_key = os.getenv('YOUTUBE_API_KEY')
+
+if not gemini_key:
+    print(" β GEMINI_API_KEY not set")
+    exit(1)
+else:
+    print(f" β GEMINI_API_KEY: ...{gemini_key[-8:]}")
+
+if not youtube_key:
+    print(" β οΈ YOUTUBE_API_KEY not set (will use sample data)")
+else:
+    print(f" β YOUTUBE_API_KEY: ...{youtube_key[-8:]}")
+PYEOF
+
+# A missing Gemini key is fatal: nothing downstream can work without it.
+if [ $? -ne 0 ]; then
+    echo ""
+    echo "Fix configuration and try again."
+    exit 1
+fi
+
+cd ..
+
+# Test 3: Check Python dependencies (best effort: missing modules are
+# reported but do not abort, so the endpoint test can still run)
+echo ""
+echo "3οΈβ£ Checking Python dependencies..."
+
+python3 << PYEOF
+try:
+    import requests
+    print(" β requests")
+except ImportError:
+    print(" β requests (pip install requests)")
+
+try:
+    import flask
+    print(" β flask")
+except ImportError:
+    print(" β flask (pip install flask)")
+
+try:
+    import flask_cors
+    print(" β flask-cors")
+except ImportError:
+    print(" β flask-cors (pip install flask-cors)")
+
+try:
+    import google.generativeai
+    print(" β google-generativeai")
+except ImportError:
+    print(" β google-generativeai (pip install google-generativeai)")
+
+try:
+    from dotenv import load_dotenv
+    print(" β python-dotenv")
+except ImportError:
+    print(" β python-dotenv (pip install python-dotenv)")
+PYEOF
+
+# Test 4: Start backend and test endpoint
+echo ""
+echo "4οΈβ£ Testing backend endpoint..."
+echo " Starting Flask API..."
+
+cd src
+python3 api.py > /tmp/podvibe_test.log 2>&1 &
+API_PID=$!
+cd ..
+
+# Wait for API to start
+sleep 5
+
+# Test the endpoint
+echo " Testing /api/trending..."
+RESPONSE=$(curl -s http://localhost:8000/api/trending)
+
+if echo "$RESPONSE" | grep -q "success"; then
+    echo " β /api/trending working!"
+
+    # Check if using real data or sample data
+    if echo "$RESPONSE" | grep -q '"using_sample_data": false'; then
+        echo " π Using REAL YouTube data!"
+    else
+        echo " βΉοΈ Using sample data (no YouTube API key)"
+    fi
+
+    # Count categories (each entry carries a "name" key in the JSON)
+    CATEGORIES=$(echo "$RESPONSE" | grep -o '"name"' | wc -l)
+    echo " π Found $CATEGORIES categories"
+else
+    echo " β /api/trending failed"
+    echo " Response: $RESPONSE"
+    FAILED=1
+fi
+
+# Cleanup: stop the temporary API process
+kill $API_PID 2>/dev/null
+
+echo ""
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo " TEST COMPLETE"
+echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
+echo ""
+# Only claim success when every check actually succeeded.
+if [ "$FAILED" -eq 0 ]; then
+    echo "β All tests passed!"
+    echo ""
+    echo "To start the app:"
+    echo " cd src && python3 api.py"
+    echo ""
+else
+    echo "β Some tests failed - review the output above."
+    echo ""
+    exit 1
+fi
+
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 000000000..238655f2c
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,27 @@
+{
+ "compilerOptions": {
+ // Enable latest features
+ "lib": ["ESNext", "DOM"],
+ "target": "ESNext",
+ "module": "ESNext",
+ "moduleDetection": "force",
+ "jsx": "react-jsx",
+ "allowJs": true,
+
+ // Bundler mode
+ "moduleResolution": "bundler",
+ "allowImportingTsExtensions": true,
+ "verbatimModuleSyntax": true,
+ "noEmit": true,
+
+ // Best practices
+ "strict": true,
+ "skipLibCheck": true,
+ "noFallthroughCasesInSwitch": true,
+
+ // Some stricter flags (disabled by default)
+ "noUnusedLocals": false,
+ "noUnusedParameters": false,
+ "noPropertyAccessFromIndexSignature": false
+ }
+}
diff --git a/vite.config.ts b/vite.config.ts
new file mode 100644
index 000000000..0ce1839d2
--- /dev/null
+++ b/vite.config.ts
@@ -0,0 +1,7 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import tailwindcss from "@tailwindcss/vite";
+
+export default defineConfig({
+ plugins: [react(), tailwindcss()],
+});