diff --git a/.github/workflows/freebuff-e2e.yml b/.github/workflows/freebuff-e2e.yml
new file mode 100644
index 0000000000..acf0a63e3f
--- /dev/null
+++ b/.github/workflows/freebuff-e2e.yml
@@ -0,0 +1,111 @@
+name: Freebuff E2E Tests
+
+on:
+  push:
+    branches: ['main']
+  pull_request:
+    branches: ['main'] # pull requests that target main
+  workflow_dispatch: # Manual trigger
+
+concurrency:
+  group: freebuff-e2e-${{ github.ref }} # one concurrency group per git ref
+  cancel-in-progress: true # a newer run in the same group cancels the one still running
+
+jobs:
+  build-freebuff: # builds the Freebuff binary once and publishes it as an artifact
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - uses: ./.github/actions/setup-project # repo-local action, so it needs the checkout above
+
+      - name: Set environment variables # exports only the secrets whose names the generate-ci-env script prints, plus fixed build flags
+        env:
+          SECRETS_CONTEXT: ${{ toJSON(secrets) }} # every available secret as one JSON object; filtered by the script below
+        run: |
+          VAR_NAMES=$(bun scripts/generate-ci-env.ts --scope client)
+          echo "$SECRETS_CONTEXT" | jq -r --argjson vars "$VAR_NAMES" '
+            to_entries | .[] | select(.key as $k | $vars | index($k)) | .key + "=" + .value
+          ' >> $GITHUB_ENV
+          echo "FREEBUFF_MODE=true" >> $GITHUB_ENV
+          echo "NEXT_PUBLIC_CB_ENVIRONMENT=prod" >> $GITHUB_ENV
+          echo "CODEBUFF_GITHUB_ACTIONS=true" >> $GITHUB_ENV
+
+      - name: Build Freebuff binary
+        run: bun freebuff/cli/build.ts 0.0.0-e2e
+
+      - name: Smoke test binary # fails the job early if the built binary cannot print its version
+        run: |
+          chmod +x cli/bin/freebuff
+          cli/bin/freebuff --version
+
+      - name: Upload binary
+        uses: actions/upload-artifact@v7
+        with:
+          name: freebuff-binary # the e2e job downloads the artifact under this name
+          path: cli/bin/freebuff
+          retention-days: 1
+
+  e2e: # runs each e2e test file in its own matrix job against the binary built by build-freebuff
+    needs: build-freebuff
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    strategy:
+      fail-fast: false # let every test file report its own result even when another one fails
+      matrix:
+        test:
+          - version
+          - startup
+          - help-command
+          - slash-commands
+          - ads-behavior
+          - agent-startup
+          - code-edit
+          - terminal-command
+    name: e2e-${{ matrix.test }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - uses: ./.github/actions/setup-project
+
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
+
+      - name: Download Freebuff binary
+        uses: actions/download-artifact@v4
+        with:
+          name: freebuff-binary
+          path: cli/bin/
+
+      - name: Make binary executable
+        run: chmod +x cli/bin/freebuff
+
+      - name: Set environment variables # secrets reach the script only through SECRETS_CONTEXT, never by inline expression expansion
+        env:
+          SECRETS_CONTEXT: ${{ toJSON(secrets) }}
+        run: |
+          VAR_NAMES=$(bun scripts/generate-ci-env.ts)
+          echo "$SECRETS_CONTEXT" | jq -r --argjson vars "$VAR_NAMES" '
+            to_entries | .[] | select(.key as $k | $vars | index($k)) | .key + "=" + .value
+          ' >> $GITHUB_ENV
+          echo "CODEBUFF_GITHUB_ACTIONS=true" >> $GITHUB_ENV
+          echo "NEXT_PUBLIC_CB_ENVIRONMENT=test" >> $GITHUB_ENV
+          echo "CODEBUFF_GITHUB_TOKEN=$(echo "$SECRETS_CONTEXT" | jq -r '.CODEBUFF_GITHUB_TOKEN // ""')" >> $GITHUB_ENV
+          echo "CODEBUFF_API_KEY=$(echo "$SECRETS_CONTEXT" | jq -r '.CODEBUFF_API_KEY // ""')" >> $GITHUB_ENV
+
+      - name: Build SDK
+        run: cd sdk && bun run build
+
+      - name: Run e2e test - ${{ matrix.test }}
+        run: bun test freebuff/e2e/tests/${{ matrix.test }}.e2e.test.ts --timeout=120000
+
+      - name: Upload tmux session logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v7
+        with:
+          name: tmux-logs-${{ matrix.test }} # matrix value keeps artifact names unique per job
+          path: debug/tmux-sessions/
+          retention-days: 7
diff --git a/cli/release/package.json b/cli/release/package.json
index 6da3d70989..f51779ae8b 100644
--- a/cli/release/package.json
+++ b/cli/release/package.json
@@ -1,6 +1,6 @@
 {
   "name": "codebuff",
-  "version": "1.0.630",
+  "version": "1.0.631",
   "description": "AI coding agent",
   "license": "MIT",
   "bin": {
diff --git a/common/src/types/session-state.ts b/common/src/types/session-state.ts
index f4ac626747..3896f87886 100644
--- a/common/src/types/session-state.ts
+++ b/common/src/types/session-state.ts
@@ -68,6 +68,7 @@ export const AgentOutputSchema = z.discriminatedUnion('type', [
     type: z.literal('error'),
     message: z.string(),
     statusCode: z.number().optional(),
+    error: z.string().optional(),
   }),
 ])
 export type AgentOutput = z.infer<typeof AgentOutputSchema>
diff --git a/common/src/util/error.ts b/common/src/util/error.ts
index 188df1ca9c..1861e1d399 100644
--- a/common/src/util/error.ts
+++ b/common/src/util/error.ts
@@ -187,6 +187,35 @@ export function unwrapPromptResult<T>(result: PromptResult<T>): T {
   return result.value
 }
 
+/**
+ * Extracts structured error details from the raw response body of a failed API call
+ * (e.g. the `responseBody` string that the AI SDK's `APICallError` carries). Never throws.
+ *
+ * @param responseBody - Raw body; any non-string value or malformed JSON yields `{}`.
+ * @returns `errorCode` taken from a string-valued top-level `error` field and `message`
+ *   from a string-valued top-level `message` field; a field is omitted when absent or non-string.
+ */
+export function parseApiErrorResponseBody(responseBody: unknown): {
+  errorCode?: string
+  message?: string
+} {
+  if (typeof responseBody !== 'string') return {}
+  try {
+    const parsed: unknown = JSON.parse(responseBody)
+    if (!parsed || typeof parsed !== 'object') return {} // null and JSON primitives have no fields
+    const result: { errorCode?: string; message?: string } = {}
+    if ('error' in parsed && typeof (parsed as { error: unknown }).error === 'string') {
+      result.errorCode = (parsed as { error: string }).error
+    }
+    if ('message' in parsed && typeof (parsed as { message: unknown }).message === 'string') {
+      result.message = (parsed as { message: string }).message
+    }
+    return result
+  } catch {
+    return {} // malformed JSON is treated the same as a body with no usable fields
+  }
+}
+
 // Extended error properties that various libraries add to Error objects
 interface ExtendedErrorProperties {
   status?: number
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 0000000000..7e2adb3e89
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,244 @@
+# Architecture Overview
+
+Codebuff is a TypeScript monorepo (Bun workspaces) that provides an AI-powered coding assistant via a CLI, SDK, and web API.
+
+## Package Dependency Graph
+
+```
+                                  ┌──────────┐
+                                  │   cli/   │  TUI client (OpenTUI + React)
+                                  └────┬─────┘
+                                       │
+                                  ┌────▼─────┐
+                          ┌───────│   sdk/   │  JS/TS SDK
+                          │       └────┬─────┘
+                          │            │
+                  ┌───────▼────────┐   │
+                  │ agent-runtime/ │◄──┘  Agent execution engine
+                  └───────┬────────┘
+                          │
+          ┌───────────────┼───────────────┐
+          │               │               │
+    ┌─────▼─────┐   ┌─────▼─────┐   ┌─────▼─────┐
+    │  agents/  │   │  common/  │   │ internal/ │
+    └───────────┘   └─────┬─────┘   └─────┬─────┘
+                          │               │
+                    ┌─────┼─────┐   ┌─────┼─────────┐
+                    │     │     │   │     │         │
+               billing/ bigquery/ code-map/    web/
+```
+
+## Packages
+
+### `cli/` — TUI Client
+
+The user-facing terminal UI, built with [OpenTUI](https://github.com/sst/opentui) (a React renderer for terminals) and React hooks.
+
+- **Entry point:** `src/index.tsx` → `src/app.tsx` → `src/chat.tsx`
+- **Key responsibilities:**
+  - Renders the chat interface, agent output, tool call results, and status indicators
+  - Manages user input, slash commands (`/help`, `/usage`), and agent mode selection (DEFAULT, MAX, PLAN)
+  - Handles authentication (login polling, OAuth), session persistence, and chat history
+  - Calls `client.run()` from the SDK and processes streaming events
+- **Depends on:** `sdk`, `common`
+
+### `sdk/` — JavaScript/TypeScript SDK
+
+The public SDK used by the CLI and available to external users via `@codebuff/sdk` on npm.
+
+- **Entry point:** `src/client.ts` (`CodebuffClient`) → `src/run.ts` (`run()`)
+- **Key responsibilities:**
+  - Orchestrates agent runs: initializes session state, registers tool handlers, calls `callMainPrompt()`
+  - **Executes tool calls locally** on the user's machine (file edits, terminal commands, code search)
+  - Manages model provider selection: Claude OAuth, ChatGPT OAuth, or Codebuff backend
+  - Handles credentials, retry logic, and error transformation
+- **Depends on:** `agent-runtime`, `common`, `internal` (for OpenAI-compatible provider)
+
+### `packages/agent-runtime/` — Agent Execution Engine
+
+The core agent loop that drives LLM inference, tool execution, and multi-step reasoning.
+
+- **Entry point:** `src/main-prompt.ts` → `src/run-agent-step.ts` (`loopAgentSteps()`)
+- **Key responsibilities:**
+  - Runs the agent loop: LLM call → process response → execute tool calls → repeat
+  - Manages agent templates, system prompts, and tool definitions
+  - Handles subagent spawning, programmatic agent steps (`handleSteps` generators)
+  - Processes the AI SDK stream (`streamText()`) and routes tool calls to the SDK
+  - Manages context token counting, cache debugging, and cost tracking
+- **Depends on:** `common`, `agents` (for agent templates)
+
+### `common/` — Shared Library
+
+Shared types, utilities, constants, and tool definitions used across the entire monorepo.
+
+- **Key areas:**
+  - `src/types/` — TypeScript types: `SessionState`, `AgentOutput`, `Message`, contracts for DI
+  - `src/tools/` — Tool parameter schemas (Zod), tool names, and tool call validation
+  - `src/constants/` — Model configs, agent IDs, OAuth settings, billing constants
+  - `src/util/` — Error handling (`ErrorOr<T>`), message utilities, string helpers, XML parsing
+  - `src/templates/` — Agent definition types, initial `.agents/` directory template
+  - `src/testing/` — Mock factories for database, filesystem, analytics, fetch, timers
+- **Depends on:** nothing (leaf package)
+
+### `agents/` — Agent Definitions
+
+Prompt-based and programmatic agent definitions that ship with Codebuff.
+
+- **Key agents:**
+  - `base2/` — The default agent (base2, base2-max, base2-free, base2-plan)
+  - `editor/` — Code editing specialist with best-of-N selection
+  - `file-explorer/` — File picker, code searcher, directory lister, glob matcher
+  - `thinker/` — Deep reasoning agent with best-of-N variants
+  - `reviewer/` — Code review agent with multi-prompt variant
+  - `researcher/` — Web search and docs search agents
+  - `general-agent/` — General-purpose agents (opus-agent, gpt-5-agent)
+  - `commander.ts` / `commander-lite.ts` — Terminal command execution agents
+  - `context-pruner.ts` — Conversation summarization to manage context length
+- **Depends on:** `common` (for agent definition types and tool params)
+
+### `web/` — Next.js Web Application
+
+The Codebuff web server, marketing site, and API.
+
+- **Key areas:**
+  - `src/app/api/v1/chat/completions/` — The main LLM proxy endpoint (routes to OpenRouter, Fireworks, OpenAI)
+  - `src/app/api/v1/` — REST API: agent runs, feedback, usage, web search, docs search, token count
+  - `src/app/api/auth/` — NextAuth.js authentication (GitHub OAuth)
+  - `src/app/api/stripe/` — Billing: credit purchases, subscriptions, webhooks
+  - `src/app/api/agents/` — Agent registry: publish, validate, fetch
+  - `src/app/api/orgs/` — Organization management: teams, billing, repos
+  - `src/app/` — Marketing pages, docs (MDX via contentlayer), user profile, pricing
+  - `src/llm-api/` — LLM provider integrations (OpenRouter, Fireworks, OpenAI, SiliconFlow, CanopyWave)
+- **Depends on:** `common`, `internal`, `billing`, `bigquery`
+
+### `packages/internal/` — Internal Utilities
+
+Server-side utilities, database schema, and vendor forks shared between `web` and `sdk`.
+
+- **Key areas:**
+  - `src/db/` — Drizzle ORM schema (`schema.ts`), migrations, Docker Compose for local Postgres
+  - `src/env.ts` — Server environment variable validation (@t3-oss/env-nextjs)
+  - `src/loops/` — Loops email service integration (transactional emails)
+  - `src/openai-compatible/` — Forked OpenAI-compatible AI SDK provider (used by the SDK to call the Codebuff backend)
+  - `src/openrouter-ai-sdk/` — Forked OpenRouter AI SDK provider (used by the web server)
+  - `src/templates/` — Agent template fetching and validation
+- **Depends on:** `common`
+
+### `packages/billing/` — Billing & Credits
+
+Credit management, subscription handling, and usage tracking.
+
+- **Key components:**
+  - `balance-calculator.ts` — Credit balance calculation (free, purchased, rollover, subscription grants)
+  - `subscription.ts` — Subscription plan management, block grants, weekly limits
+  - `grant-credits.ts` — Credit grant operations (referral, purchase, admin, free)
+  - `auto-topup.ts` — Automatic credit purchases when balance is low
+  - `usage-service.ts` — Usage data aggregation
+  - `credit-delegation.ts` — Organization credit delegation
+- **Depends on:** `common` (for DB access, Stripe utils, types)
+
+### `packages/bigquery/` — Analytics Data
+
+Google BigQuery integration for storing agent interaction traces and usage analytics.
+
+- **Tables:** `traces` (agent interactions), `relabels` (fine-tuning relabeling data)
+- **Trace types:** file selection calls, file trees, agent responses, training data, model grading
+- **Depends on:** `common`
+
+### `packages/code-map/` — Code Parsing
+
+Tree-sitter based source code parser that extracts function/variable names for file tree display.
+
+- **Supports:** TypeScript, JavaScript, Python, Go, Rust, Java, C, C++, C#, Ruby, PHP
+- **Used by:** The `read_subtree` tool to show parsed variable names alongside the file tree
+- **Depends on:** nothing (leaf package)
+
+### `packages/build-tools/` — Build Utilities
+
+Custom build executors, currently just the Infisical secrets integration.
+
+### `.agents/` — Local Agent Templates
+
+Project-specific agent definitions for this repository. These are loaded automatically by the agent runtime.
+
+- CLI agent templates (claude-code-cli, codex-cli, gemini-cli, codebuff-local-cli)
+- Notion query agents
+- Skills (cleanup, meta, review)
+
+### `evals/` — Evaluation Framework
+
+BuffBench evaluation suite for measuring agent performance on real-world coding tasks.
+
+- **Workflow:** Pick commits → generate eval tasks → run agents → judge results → extract lessons
+- **Runners:** Codebuff, Claude Code, Codex
+- **Depends on:** `common`, `agent-runtime`, `sdk`
+
+### `freebuff/` — Free Tier Product
+
+A separate free-to-use version of Codebuff with its own CLI binary and web app.
+
+- `freebuff/cli/` — Standalone CLI binary and release scripts
+- `freebuff/web/` — Minimal Next.js app for auth (login, onboarding)
+- Uses ChatGPT OAuth for free LLM access (no Codebuff credits required)
+
+### `scripts/` — Development & Operations
+
+Developer tooling, analytics scripts, and service management.
+
+- `start-services.ts` / `stop-services.ts` / `status-services.ts` — Local dev environment management
+- `tmux/` — tmux helper scripts for CLI E2E testing
+- Analytics: DAU calculation, MRR, subscriber profitability, model usage
+- Release: changelog generation, credit grants, worktree management
+
+## Key Architectural Patterns
+
+### Dependency Injection via Contracts
+
+The codebase avoids tight coupling between packages using contract types in `common/src/types/contracts/`:
+
+- `database.ts` — DB access functions (`GetUserInfoFromApiKeyFn`, `StartAgentRunFn`, etc.)
+- `llm.ts` — LLM calling functions (`PromptAiSdkStreamFn`, `PromptAiSdkFn`)
+- `analytics.ts` — Event tracking (`TrackEventFn`)
+- `client.ts` — Client-server communication (`RequestToolCallFn`, `SendActionFn`)
+- `env.ts` — Environment variable access (`BaseEnv`, `ClientEnv`, `CiEnv`)
+
+This allows the agent-runtime to be used by both the SDK (local execution) and the web server (if needed) without direct dependencies.
+
+### ErrorOr Pattern
+
+Prefer `ErrorOr<T>` return values (`success(value)` / `failure(error)`) over throwing exceptions. Defined in `common/src/util/error.ts`.
+
+### Local Tool Execution
+
+Tool calls (file edits, terminal commands, code search) execute **on the user's machine** via the SDK, not on the server. The agent-runtime sends tool call requests through `requestToolCall`, which the SDK handles locally.
+
+### AI SDK Integration
+
+The project uses Vercel's [AI SDK](https://sdk.vercel.ai/) (`ai` package) for LLM interactions:
+
+- `streamText()` for streaming responses
+- `generateText()` / `generateObject()` for non-streaming
+- Custom `OpenAICompatibleChatLanguageModel` provider for the Codebuff backend
+- `APICallError` for HTTP error handling (see [Error Schema](./error-schema.md))
+
+### Agent Template System
+
+Agents are defined as templates with:
+
+- **Prompt agents** — System prompt + tool list + spawnable subagents
+- **Programmatic agents** — `handleSteps` generator functions that run in a sandbox
+- Templates live in `agents/` (shipped) and `.agents/` (project-local)
+- Users can publish agents to the Codebuff registry
+
+## Development
+
+```bash
+bun up          # Start web server + database
+bun start-cli   # Start CLI (separate terminal)
+bun ps          # Check running services
+bun down        # Stop services
+bun typecheck   # Run all type checks
+bun test        # Run all tests
+```
+
+See the [Request Flow](./request-flow.md) doc for the detailed path a prompt takes through the system.
diff --git a/docs/error-schema.md b/docs/error-schema.md
new file mode 100644
index 0000000000..8cc9b088b5
--- /dev/null
+++ b/docs/error-schema.md
@@ -0,0 +1,213 @@
+# Error Schema: Server Responses & Client Handling
+
+This document describes the error responses the Codebuff server sends, how the AI SDK transforms them, and how errors are ultimately displayed in the CLI.
+
+## Server Error Responses
+
+**Source:** `web/src/app/api/v1/chat/completions/_post.ts`
+
+The server returns JSON error responses with an HTTP status code. There are two shapes:
+
+### Simple errors (message only)
+
+```json
+{ "message": "<human-readable message>" }
+```
+
+Used for:
+
+| Status | Example message |
+|--------|----------------|
+| 400 | `"Invalid JSON in request body"` |
+| 400 | `"No runId found in request body"` |
+| 401 | `"Unauthorized"` |
+| 401 | `"Invalid Codebuff API key"` |
+| 402 | `"Out of credits. Please add credits at https://codebuff.com/usage. Your free credits reset in 3 hours."` |
+
+### Typed errors (error code + message)
+
+```json
+{ "error": "<machine-readable code>", "message": "<human-readable message>" }
+```
+
+Used for errors that the client needs to identify programmatically:
+
+| Status | `error` code | Example `message` |
+|--------|-------------|-------------------|
+| 403 | `account_suspended` | `"Your account has been suspended due to billing issues. Please contact support@codebuff.com to resolve this."` |
+| 403 | `free_mode_unavailable` | `"Free mode is not available in your country."` |
+| 429 | `rate_limit_exceeded` | `"Subscription weekly limit reached. Your limit resets in 2 hours. Enable 'Continue with credits' in the CLI to use a-la-carte credits."` |
+
+### Catch-all server error
+
+```json
+{ "error": "Failed to process request" }
+```
+
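+The 500 catch-all uses `error` as a human-readable string (no `message` field). This does not follow the typed error pattern above — it's a legacy format. As a result, `parseApiErrorResponseBody()` returns this sentence as `errorCode` and leaves `message` unset, so clients must not assume `errorCode` is always a machine-readable code.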
+
+### Provider errors
+
+When the upstream LLM provider (OpenRouter, Fireworks, OpenAI, etc.) returns an error, the server passes it through via the provider's `.toJSON()` format, which varies by provider.
+
+## The AI SDK Transformation Problem
+
+The Codebuff backend is called through the AI SDK's `OpenAICompatibleChatLanguageModel`, which treats it as a standard OpenAI-compatible endpoint. When the server returns a non-2xx response, **the AI SDK wraps it** into an `APICallError`:
+
+```
+Server returns:   HTTP 403  { "error": "free_mode_unavailable", "message": "Free mode is not available in your country." }
+                      │
+                      ▼
+AI SDK creates:   APICallError {
+                    message: "Forbidden"              ← HTTP status text (NOT the server's message)
+                    statusCode: 403
+                    responseBody: "{\"error\":\"free_mode_unavailable\",\"message\":\"Free mode is not available in your country.\"}"  ← original JSON as a string
+                  }
+```
+
+The server's human-readable `message` and machine-readable `error` code are buried inside `responseBody` as a JSON string. The `APICallError.message` is just the HTTP status text ("Forbidden", "Payment Required", etc.).
+
+## Client-Side Error Recovery
+
+To recover the server's structured error details, we use `parseApiErrorResponseBody()` from `common/src/util/error.ts`:
+
+```typescript
+export function parseApiErrorResponseBody(responseBody: unknown): {
+  errorCode?: string
+  message?: string
+}
+```
+
+This is called in two places:
+
+### 1. Agent Runtime catch block
+
+**File:** `packages/agent-runtime/src/run-agent-step.ts` (in `loopAgentSteps`)
+
+This is the **primary** error handler. Most API errors are caught here because the error occurs during `runAgentStep()` → `promptAiSdkStream()` → `streamText()`.
+
+```typescript
+catch (error) {
+  if (error instanceof APICallError) {
+    const parsed = parseApiErrorResponseBody(error.responseBody)
+    // parsed.errorCode = 'free_mode_unavailable'
+    // parsed.message = 'Free mode is not available in your country.'
+  }
+  // ...
+  return {
+    output: {
+      type: 'error',
+      message: hasServerMessage ? errorMessage : 'Agent run error: ' + errorMessage,
+      statusCode,
+      error: errorCode,   // ← machine-readable code for client matching
+    },
+  }
+}
+```
+
+### 2. SDK .catch() handler
+
+**File:** `sdk/src/run.ts` (in `callMainPrompt().catch()`)
+
+This is a **fallback** handler for errors that escape the agent runtime (e.g., errors during setup before the agent loop starts).
+
+## Error Output Schema
+
+**File:** `common/src/types/session-state.ts`
+
+The `AgentOutputSchema` defines the Zod schema for agent output. The error variant:
+
+```typescript
+z.object({
+  type: z.literal('error'),
+  message: z.string(),
+  statusCode: z.number().optional(),
+  error: z.string().optional(),       // machine-readable error code
+})
+```
+
+The `message`, `statusCode`, and `error` fields all flow through to the CLI.
+
+## CLI Error Handling
+
+**Files:** `cli/src/utils/error-handling.ts`, `cli/src/hooks/helpers/send-message.ts`
+
+The CLI checks the output for known error types:
+
+```typescript
+// Checks statusCode === 402
+isOutOfCreditsError(output)       → shows OUT_OF_CREDITS_MESSAGE
+
+// Checks statusCode === 403 && error === 'free_mode_unavailable'
+isFreeModeUnavailableError(output) → shows FREE_MODE_UNAVAILABLE_MESSAGE
+```
+
+For all other errors, the raw `output.message` is displayed in the `UserErrorBanner`.
+
+## Error Flow Diagram
+
+```
+  Server                    AI SDK                  Agent Runtime              SDK                    CLI
+    │                         │                         │                       │                      │
+    │  HTTP 403               │                         │                       │                      │
+    │  { error, message }     │                         │                       │                      │
+    │────────────────────────▶│                         │                       │                      │
+    │                         │  APICallError           │                       │                      │
+    │                         │  .message="Forbidden"   │                       │                      │
+    │                         │  .responseBody="{...}"  │                       │                      │
+    │                         │────────────────────────▶│                       │                      │
+    │                         │                         │  catch (APICallError) │                      │
+    │                         │                         │  parseResponseBody()  │                      │
+    │                         │                         │  extract error code   │                      │
+    │                         │                         │  extract message      │                      │
+    │                         │                         │─────────────────────▶ │                      │
+    │                         │                         │  prompt-response      │                      │
+    │                         │                         │  { type: 'error',     │                      │
+    │                         │                         │    statusCode: 403,   │                      │
+    │                         │                         │    error: '...',      │                      │
+    │                         │                         │    message: '...' }   │                      │
+    │                         │                         │                       │─────────────────────▶│
+    │                         │                         │                       │  handleRunCompletion  │
+    │                         │                         │                       │  isFreeModeUnavail..  │
+    │                         │                         │                       │  show friendly msg    │
+```
+
+## Adding a New Server Error Type
+
+To add a new error type that the CLI can identify and handle specially:
+
+1. **Server** (`web/src/app/api/v1/chat/completions/_post.ts`): Return a typed error:
+   ```typescript
+   return NextResponse.json(
+     { error: 'your_error_code', message: 'User-friendly message.' },
+     { status: 4xx },
+   )
+   ```
+
+2. **CLI error detection** (`cli/src/utils/error-handling.ts`): Add a checker:
+   ```typescript
+   export const isYourError = (error: unknown): boolean => {
+     if (
+       error &&
+       typeof error === 'object' &&
+       'statusCode' in error &&
+       (error as { statusCode: unknown }).statusCode === 4xx &&
+       'error' in error &&
+       (error as { error: unknown }).error === 'your_error_code'
+     ) {
+       return true
+     }
+     return false
+   }
+   ```
+
+3. **CLI display** (`cli/src/hooks/helpers/send-message.ts`): Handle it in `handleRunCompletion`:
+   ```typescript
+   if (isYourError(output)) {
+     updater.setError(YOUR_ERROR_MESSAGE)
+     finalizeAfterError()
+     return
+   }
+   ```
+
+No changes needed in the agent runtime or SDK — `parseApiErrorResponseBody` automatically extracts any string-valued top-level `error` and `message` fields from the server's response body.
diff --git a/docs/request-flow.md b/docs/request-flow.md
new file mode 100644
index 0000000000..427611525f
--- /dev/null
+++ b/docs/request-flow.md
@@ -0,0 +1,180 @@
+# Request Flow: CLI → Server → CLI
+
+This document traces the exact path a user prompt takes from the Codebuff CLI through the SDK, agent runtime, server, and back.
+
+## Overview
+
+```
+┌─────────┐    ┌─────────┐    ┌───────────────┐    ┌────────────────┐    ┌──────────┐
+│   CLI   │───▶│   SDK   │───▶│ Agent Runtime │───▶│ Codebuff Server│───▶│ LLM API  │
+│  (TUI)  │◀───│ run.ts  │◀───│ loopAgentSteps│◀───│  /v1/chat/...  │◀───│(OR/OAI/…)│
+└─────────┘    └─────────┘    └───────────────┘    └────────────────┘    └──────────┘
+```
+
+## Step-by-Step Flow
+
+### 1. CLI: User Input
+
+**Files:** `cli/src/hooks/use-send-message.ts`, `cli/src/hooks/helpers/send-message.ts`
+
+1. User types a prompt and hits Enter.
+2. `prepareUserMessage()` processes the input:
+   - Collects pending bash context (terminal output since last prompt)
+   - Processes image and text attachments
+   - Creates a user message in the chat UI
+3. `setupStreamingContext()` initializes:
+   - An `AbortController` (for user cancellation via Escape)
+   - A timer (tracks elapsed time)
+   - A batched message updater (efficiently updates the UI)
+4. The CLI calls `client.run()` from the SDK.
+
+### 2. SDK: Orchestration
+
+**File:** `sdk/src/run.ts`
+
+1. `run()` → `runOnce()` is called with the prompt, agent ID, cost mode, and session state.
+2. **Session state** is initialized (fresh) or restored (from `previousRun`).
+3. **User identity** is verified via `getUserInfoFromApiKey()` (calls the web API).
+4. **Tool handlers** are registered — these execute locally on the user's machine:
+   - `write_file`, `str_replace`, `apply_patch` → file edits
+   - `run_terminal_command` → shell commands
+   - `code_search`, `glob`, `list_directory` → file search
+   - `read_files` → file reading
+   - Custom tool definitions and MCP tools
+5. **Action handlers** are registered to process server responses:
+   - `response-chunk` → streams text to the CLI
+   - `subagent-response-chunk` → streams subagent output
+   - `prompt-response` → final result (resolves the promise)
+   - `prompt-error` → error result
+6. `callMainPrompt()` is called (fire-and-forget, with a `.catch()` handler).
+7. The function returns a promise that resolves when `prompt-response` or an error arrives.
+
+### 3. Agent Runtime: Main Prompt
+
+**File:** `packages/agent-runtime/src/main-prompt.ts`
+
+1. `callMainPrompt()` resets credits to 0 (server controls cost tracking).
+2. Assembles **local agent templates** from the project's `.agents/` directory.
+3. Sends a `response-chunk` `start` event to the CLI.
+4. `mainPrompt()` determines the **agent type** based on cost mode:
+   - `free` → `base-free`
+   - `normal` → `base`
+   - `max` → `base-max`
+   - `ask` → `ask`
+   - `experimental` → `base2`
+   - Fallback (default) → `base2`
+   - Or a custom agent ID
+5. Calls `loopAgentSteps()` with the agent template, prompt, and session state.
+
+### 4. Agent Runtime: Agent Loop
+
+**File:** `packages/agent-runtime/src/run-agent-step.ts`
+
+1. `loopAgentSteps()` starts an **agent run** (recorded in the database).
+2. Builds the **system prompt**, **tool definitions**, and **initial messages**.
+3. Enters the main loop:
+   ```
+   while (true) {
+     // 1. Run programmatic step (if agent has handleSteps)
+     // 2. Check if turn should end
+     // 3. Call runAgentStep() for LLM inference
+     // 4. Process tool calls and responses
+   }
+   ```
+4. Each `runAgentStep()` call:
+   - Checks context token count via the `/api/v1/token-count` endpoint
+   - Calls `getAgentStreamFromTemplate()` → `promptAiSdkStream()`
+   - `processStream()` iterates over the AI SDK stream, handling text chunks and tool calls
+   - Tool calls are sent back to the SDK via `requestToolCall`, executed locally, and results fed back
+5. The loop continues until the agent signals completion (no more tool calls, or `task_completed` tool).
+6. Sends a `response-chunk` `finish` event, then a `prompt-response` action with the final session state and output.
+
+### 5. LLM Call: Model Provider Selection
+
+**Files:** `sdk/src/impl/llm.ts`, `sdk/src/impl/model-provider.ts`
+
+`promptAiSdkStream()` selects the model provider:
+
+1. **Claude OAuth** — If the user has connected their Claude subscription and the model is a Claude model, requests go directly to `api.anthropic.com` using the user's OAuth token. Zero cost to the user's Codebuff credits.
+2. **ChatGPT OAuth** — If the user has connected their ChatGPT subscription and the model is an OpenAI model, requests go to the ChatGPT backend API.
+3. **Codebuff Backend** (default) — Requests go to `POST /api/v1/chat/completions` on the Codebuff web server, which routes to the appropriate LLM provider.
+
+For OAuth providers, rate limit errors trigger automatic fallback to the Codebuff backend (unless in free mode).
+
+The AI SDK's `streamText()` function handles the actual HTTP call, streaming, and retry logic.
+
+### 6. Server: Chat Completions Endpoint
+
+**File:** `web/src/app/api/v1/chat/completions/_post.ts`
+
+The server processes the request through several validation gates:
+
+1. **Parse request body** — Returns 400 if invalid JSON.
+2. **Authenticate** — Extracts API key from `Authorization` header. Returns 401 if missing/invalid.
+3. **Check ban status** — Returns 403 `account_suspended` if user is banned.
+4. **Free mode country check** — For free mode requests, checks user's IP against allowed countries. Returns 403 `free_mode_unavailable` if not allowed.
+5. **Validate agent run** — Checks the `run_id` exists and is in `running` status. Returns 400 if invalid.
+6. **Subscription block grant** — For subscribers, ensures a billing block is active. Returns 429 `rate_limit_exceeded` if limit hit and fallback disabled.
+7. **Credit check** — Returns 402 if user has no remaining credits (and not a free mode request).
+8. **Route to LLM provider** — Based on the model, routes to:
+   - Fireworks AI (for supported models)
+   - OpenAI direct (for OpenAI models)
+   - OpenRouter (default, for all other models)
+9. **Return response** — Streaming requests return an SSE stream (`text/event-stream`). Non-streaming requests return JSON.
+
+### 7. Response Flow Back to CLI
+
+1. The LLM provider streams tokens back to the server.
+2. The server forwards the SSE stream to the AI SDK client.
+3. `promptAiSdkStream()` yields chunks from the AI SDK's `fullStream`:
+   - `text-delta` → text content
+   - `tool-call` → tool invocation
+   - `error` → error handling (OAuth fallback, retries, etc.)
+4. `processStream()` in agent-runtime handles each chunk:
+   - Text chunks → `sendAction({ type: 'response-chunk', chunk })` → SDK → CLI UI
+   - Tool calls → `requestToolCall()` → SDK executes locally → result fed back to stream
+5. When the agent loop finishes, `callMainPrompt` sends:
+   - A `response-chunk` `finish` event (with total cost)
+   - A `prompt-response` action (with final session state and output)
+6. The SDK's `handlePromptResponse()` validates the output against `AgentOutputSchema` and resolves the promise.
+7. The CLI's `handleRunCompletion()` processes the result:
+   - Checks for known error types (out of credits, free mode unavailable)
+   - Updates the UI with completion time and credit cost
+   - Marks the message as complete
+
+## Tool Call Lifecycle
+
+Tool calls execute **locally on the user's machine**, not on the server:
+
+```
+LLM Response (tool_call)            Agent Runtime processes stream
+        │                                    │
+        ▼                                    ▼
+  processStream()  ─── requestToolCall ──▶  SDK run.ts
+        │                                    │
+        │                              handleToolCall()
+        │                                    │
+        │                              Executes locally
+        │                              (file edit, terminal, search)
+        │                                    │
+        ◀─────── tool result ───────────────┘
+        │
+  Feeds result back into next LLM call
+```
+
+## Session State
+
+Session state persists across prompts within a conversation:
+
+- `sessionState.mainAgentState.messageHistory` — Full conversation history
+- `sessionState.fileContext` — Project files, knowledge files, custom tools
+- The CLI stores the `RunState` from each run and passes it as `previousRun` to the next `client.run()` call
+
+## Cancellation
+
+When the user presses Escape:
+
+1. CLI aborts the `AbortController`
+2. The `abort` signal propagates through the SDK → agent runtime → AI SDK
+3. `loopAgentSteps` catches the `AbortError`, marks the run as `cancelled`
+4. CLI's abort handler shows an interruption notice and marks the message complete
diff --git a/freebuff/cli/release/package.json b/freebuff/cli/release/package.json
index c893ed5cab..f5302ff59c 100644
--- a/freebuff/cli/release/package.json
+++ b/freebuff/cli/release/package.json
@@ -1,6 +1,6 @@
 {
   "name": "freebuff",
-  "version": "0.0.14",
+  "version": "0.0.15",
   "description": "The world's strongest free coding agent",
   "license": "MIT",
   "bin": {
diff --git a/freebuff/e2e/README.md b/freebuff/e2e/README.md
new file mode 100644
index 0000000000..861d31f5be
--- /dev/null
+++ b/freebuff/e2e/README.md
@@ -0,0 +1,169 @@
+# Freebuff E2E Tests
+
+End-to-end tests for the Freebuff CLI binary. Tests verify that the compiled binary works correctly by interacting with it via tmux.
+
+## Architecture
+
+Two testing approaches are supported:
+
+### 1. Direct tmux tests (fast, deterministic)
+
+Use the `FreebuffSession` class to start the binary in tmux, send commands, capture output, and assert directly.
+
+```typescript
+import { describe, test, expect, afterEach } from 'bun:test'
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+describe('My Feature', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) await session.stop()
+    session = null
+  })
+
+  test('works correctly', async () => {
+    const binary = requireFreebuffBinary()
+    session = await FreebuffSession.start(binary)
+
+    await session.send('/help')
+    const output = await session.capture(2)
+
+    expect(output).toContain('Shortcuts')
+  }, 60_000)
+})
+```
+
+### 2. SDK agent-driven tests (AI-powered verification)
+
+Use the Codebuff SDK to run a testing agent that interacts with Freebuff via custom tmux tools. The agent reasons about the CLI output and verifies complex behaviors.
+
+```typescript
+import { describe, test, expect, afterEach } from 'bun:test'
+import { CodebuffClient } from '@codebuff/sdk'
+import { freebuffTesterAgent } from '../agent/freebuff-tester'
+import { createFreebuffTmuxTools, requireFreebuffBinary } from '../utils'
+
+describe('Agent Test', () => {
+  let cleanup: (() => Promise<void>) | null = null
+
+  afterEach(async () => {
+    if (cleanup) await cleanup()
+    cleanup = null
+  })
+
+  test('verifies startup', async () => {
+    const apiKey = process.env.CODEBUFF_API_KEY
+    if (!apiKey) return // Skip if no API key
+
+    const binary = requireFreebuffBinary()
+    const tmuxTools = createFreebuffTmuxTools(binary)
+    cleanup = tmuxTools.cleanup
+
+    const client = new CodebuffClient({ apiKey })
+    const result = await client.run({
+      agent: freebuffTesterAgent.id,
+      prompt: 'Start Freebuff and verify the branding is correct.',
+      agentDefinitions: [freebuffTesterAgent],
+      customToolDefinitions: tmuxTools.tools,
+      handleEvent: () => {},
+    })
+
+    expect(result.output.type).not.toBe('error')
+  }, 180_000)
+})
+```
+
+## Prerequisites
+
+- **tmux** must be installed: `brew install tmux` (macOS) or `sudo apt-get install tmux` (Ubuntu)
+- **Freebuff binary** must be built: `bun freebuff/cli/build.ts 0.0.0-dev`
+- **SDK built** (for agent tests): `cd sdk && bun run build`
+- **CODEBUFF_API_KEY** (for agent tests only): Set this environment variable
+
+## Running Tests
+
+### Build the binary first
+
+```bash
+bun freebuff/cli/build.ts 0.0.0-dev
+```
+
+### Run all tests
+
+```bash
+bun test freebuff/e2e/tests/
+```
+
+### Run a specific test
+
+```bash
+bun test freebuff/e2e/tests/version.e2e.test.ts
+bun test freebuff/e2e/tests/startup.e2e.test.ts
+bun test freebuff/e2e/tests/help-command.e2e.test.ts
+bun test freebuff/e2e/tests/agent-startup.e2e.test.ts
+```
+
+### Use a custom binary path
+
+```bash
+FREEBUFF_BINARY=/path/to/freebuff bun test freebuff/e2e/tests/
+```
+
+## Adding New Tests
+
+1. Create a new file in `freebuff/e2e/tests/` with the naming convention `<feature>.e2e.test.ts`
+2. Add the test name to `.github/workflows/freebuff-e2e.yml` matrix:
+
+```yaml
+matrix:
+  test:
+    - version
+    - startup
+    - help-command
+    - agent-startup
+    - your-new-test    # <-- add here
+```
+
+3. The test will automatically run in parallel with other tests in CI.
+
+## CI Workflow
+
+The `.github/workflows/freebuff-e2e.yml` workflow:
+
+1. **Builds** the Freebuff binary once (linux-x64)
+2. **Runs each test file in parallel** via GitHub Actions matrix strategy
+3. **Uploads tmux session logs** on failure for debugging
+
+Triggers:
+- **Push** and **pull request** events targeting `main`
+- **Manual** via workflow_dispatch
+
+## Utilities Reference
+
+### `FreebuffSession`
+
+| Method | Description |
+|--------|-------------|
+| `FreebuffSession.start(binaryPath, options?)` | Start binary in tmux, returns session |
+| `session.send(text, options?)` | Send text input (presses Enter unless `noEnter: true`) |
+| `session.sendKey(key)` | Send special key (e.g. `'C-c'`, `'Escape'`) |
+| `session.capture(waitSec?)` | Capture terminal output |
+| `session.captureLabeled(label, waitSec?)` | Capture and save to session logs |
+| `session.waitForText(pattern, timeoutMs?)` | Poll until text appears |
+| `session.stop()` | Stop session and clean up |
+
+### `createFreebuffTmuxTools(binaryPath)`
+
+Creates SDK custom tools for agent-driven testing:
+- `start_freebuff` - Launch the CLI
+- `send_to_freebuff` - Send text input
+- `capture_freebuff_output` - Capture terminal output
+- `stop_freebuff` - Stop and clean up
+
+### Helper functions
+
+| Function | Description |
+|----------|-------------|
+| `requireFreebuffBinary()` | Get binary path, throws if not found |
+| `getFreebuffBinaryPath()` | Get binary path (may not exist) |
diff --git a/freebuff/e2e/agent/freebuff-tester.ts b/freebuff/e2e/agent/freebuff-tester.ts
new file mode 100644
index 0000000000..a58d6dfb49
--- /dev/null
+++ b/freebuff/e2e/agent/freebuff-tester.ts
@@ -0,0 +1,52 @@
+import type { AgentDefinition } from '@codebuff/sdk'
+
+/**
+ * Agent definition for testing the Freebuff CLI via tmux.
+ *
+ * This agent is designed to be used with the custom tmux tools from
+ * `createFreebuffTmuxTools()`. It receives a testing task in its prompt
+ * and uses tmux tools to start Freebuff, interact with it, and verify behavior.
+ *
+ * Example usage:
+ * ```ts
+ * const { tools, cleanup } = createFreebuffTmuxTools(binaryPath)
+ * const result = await client.run({
+ *   agent: freebuffTesterAgent.id,
+ *   prompt: 'Start freebuff and verify the welcome screen shows Freebuff branding',
+ *   agentDefinitions: [freebuffTesterAgent],
+ *   customToolDefinitions: tools,
+ *   handleEvent: () => {}, // or push each event into an array to assert on tool calls
+ * })
+ * await cleanup()
+ * ```
+ */
+export const freebuffTesterAgent: AgentDefinition = {
+  id: 'freebuff-tester',
+  displayName: 'Freebuff E2E Tester',
+  model: 'anthropic/claude-sonnet-4.5',
+  toolNames: [
+    'start_freebuff',
+    'send_to_freebuff',
+    'capture_freebuff_output',
+    'stop_freebuff',
+  ],
+  instructionsPrompt: `You are a QA tester for the Freebuff CLI application.
+
+Your job is to verify that Freebuff behaves correctly by interacting with it
+through tmux tools. Follow these steps:
+
+1. Call start_freebuff to launch the CLI
+2. Use capture_freebuff_output (with waitSeconds) to see the terminal output
+3. Use send_to_freebuff to type commands or text
+4. Capture output again to verify behavior
+5. ALWAYS call stop_freebuff when done
+
+Key things to verify:
+- The CLI starts without errors or crashes
+- Branding shows "Freebuff" (not "Codebuff")
+- Commands work as expected
+- Error messages are user-friendly
+
+Report your findings clearly. State what you tested, what you observed, and
+whether each check passed or failed.`,
+}
diff --git a/freebuff/e2e/tests/ads-behavior.e2e.test.ts b/freebuff/e2e/tests/ads-behavior.e2e.test.ts
new file mode 100644
index 0000000000..1ba9fe4d4e
--- /dev/null
+++ b/freebuff/e2e/tests/ads-behavior.e2e.test.ts
@@ -0,0 +1,79 @@
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const TEST_TIMEOUT = 60_000
+
+describe('Freebuff: Ads Behavior', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'ads:enable command is not available',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Type "/ads" to check for ads commands in autocomplete
+      await session.send('/ads', { noEnter: true })
+      const output = await session.capture(2)
+
+      // Neither ads:enable nor ads:disable should appear
+      expect(output).not.toContain('ads:enable')
+      expect(output).not.toContain('ads:disable')
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'ads:disable command is not available',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Try to send the /ads:disable command
+      await session.send('/ads:disable')
+      const output = await session.capture(3)
+
+      // The command should not be recognized
+      // It should NOT show "Ads disabled" confirmation
+      expect(output).not.toMatch(/ads disabled/i)
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'does not show credits earned from ads',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+      const output = await session.capture()
+
+      // In Freebuff, ads don't show "+X credits" because credits don't apply
+      // Check the startup screen doesn't mention ad credits
+      expect(output).not.toMatch(/\+\d+ credits/)
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'does not show "Hide ads" option',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+      const output = await session.capture()
+
+      // In Freebuff, the "Hide ads" link is not shown because ads are mandatory
+      expect(output).not.toContain('Hide ads')
+      // Also should not mention /ads:enable as a way to re-enable
+      expect(output).not.toContain('/ads:enable')
+    },
+    TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/agent-startup.e2e.test.ts b/freebuff/e2e/tests/agent-startup.e2e.test.ts
new file mode 100644
index 0000000000..6d436758a8
--- /dev/null
+++ b/freebuff/e2e/tests/agent-startup.e2e.test.ts
@@ -0,0 +1,122 @@
+/**
+ * Agent-driven E2E test for Freebuff.
+ *
+ * Uses the Codebuff SDK to run a testing agent that interacts with the
+ * Freebuff CLI binary via tmux custom tools. Requires CODEBUFF_API_KEY.
+ *
+ * Without CODEBUFF_API_KEY each test logs a notice and returns early (passes).
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { freebuffTesterAgent } from '../agent/freebuff-tester'
+import { createFreebuffTmuxTools, requireFreebuffBinary } from '../utils'
+
+import type { CodebuffClient as CodebuffClientType } from '@codebuff/sdk'
+
+const AGENT_TEST_TIMEOUT = 180_000
+
+function getApiKey(): string | null {
+  return process.env.CODEBUFF_API_KEY ?? null
+}
+
+describe('Freebuff: Agent-driven E2E', () => {
+  let cleanup: (() => Promise<void>) | null = null
+
+  afterEach(async () => {
+    if (cleanup) {
+      await cleanup()
+      cleanup = null
+    }
+  })
+
+  test(
+    'agent can start freebuff and verify startup behavior',
+    async () => {
+      const apiKey = getApiKey()
+      if (!apiKey) {
+        console.log(
+          'Skipping agent test: CODEBUFF_API_KEY not set. ' +
+            'Set it to run agent-driven e2e tests.',
+        )
+        return
+      }
+
+      const binary = requireFreebuffBinary()
+      const tmuxTools = createFreebuffTmuxTools(binary)
+      cleanup = tmuxTools.cleanup
+
+      // Dynamically import SDK to avoid build-time dependency issues
+      const { CodebuffClient } = (await import(
+        '@codebuff/sdk'
+      )) as typeof import('@codebuff/sdk')
+
+      const client: CodebuffClientType = new CodebuffClient({ apiKey })
+
+      const events: Array<{ type: string; [key: string]: unknown }> = []
+
+      const result = await client.run({
+        agent: freebuffTesterAgent.id,
+        prompt:
+          'Start Freebuff using the start_freebuff tool. Then capture the output ' +
+          'with capture_freebuff_output (waitSeconds: 3). Verify that:\n' +
+          '1. The CLI started without errors\n' +
+          '2. The output contains "freebuff" (case-insensitive)\n' +
+          '3. The output does NOT contain "codebuff" (case-insensitive)\n' +
+          'Finally, call stop_freebuff to clean up. Report your findings.',
+        agentDefinitions: [freebuffTesterAgent],
+        customToolDefinitions: tmuxTools.tools,
+        handleEvent: (event) => {
+          events.push(event)
+        },
+      })
+
+      expect(result.output.type).not.toBe('error')
+
+      // Verify the agent used the tmux tools
+      const toolCalls = events.filter((e) => e.type === 'tool_call')
+      const toolNames = toolCalls.map((e) => e.toolName)
+      expect(toolNames).toContain('start_freebuff')
+      expect(toolNames).toContain('capture_freebuff_output')
+      expect(toolNames).toContain('stop_freebuff')
+    },
+    AGENT_TEST_TIMEOUT,
+  )
+
+  test(
+    'agent can send commands and verify output',
+    async () => {
+      const apiKey = getApiKey()
+      if (!apiKey) {
+        console.log('Skipping agent test: CODEBUFF_API_KEY not set.')
+        return
+      }
+
+      const binary = requireFreebuffBinary()
+      const tmuxTools = createFreebuffTmuxTools(binary)
+      cleanup = tmuxTools.cleanup
+
+      const { CodebuffClient } = (await import(
+        '@codebuff/sdk'
+      )) as typeof import('@codebuff/sdk')
+
+      const client: CodebuffClientType = new CodebuffClient({ apiKey })
+
+      const result = await client.run({
+        agent: freebuffTesterAgent.id,
+        prompt:
+          'Start Freebuff, wait for it to load (capture with waitSeconds: 5), ' +
+          'then send the "/help" command using send_to_freebuff. ' +
+          'Capture the output after 2 seconds. ' +
+          'Verify the help content is displayed. ' +
+          'Stop Freebuff when done and report your findings.',
+        agentDefinitions: [freebuffTesterAgent],
+        customToolDefinitions: tmuxTools.tools,
+        handleEvent: () => {},
+      })
+
+      expect(result.output.type).not.toBe('error')
+    },
+    AGENT_TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/code-edit.e2e.test.ts b/freebuff/e2e/tests/code-edit.e2e.test.ts
new file mode 100644
index 0000000000..957ccac7f9
--- /dev/null
+++ b/freebuff/e2e/tests/code-edit.e2e.test.ts
@@ -0,0 +1,75 @@
+/**
+ * E2E test that verifies Freebuff can perform a simple code edit.
+ *
+ * Starts Freebuff in tmux, sends a prompt asking it to add a console.log
+ * to a file, and verifies the file was modified correctly.
+ *
+ * Requires CODEBUFF_API_KEY — if unset, the test logs a notice and returns early.
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const TEST_TIMEOUT = 180_000
+
+function getApiKey(): string | null {
+  return process.env.CODEBUFF_API_KEY ?? null
+}
+
+describe('Freebuff: Code Edit', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'adds a console.log to a file',
+    async () => {
+      if (!getApiKey()) {
+        console.log(
+          'Skipping code-edit test: CODEBUFF_API_KEY not set. ' +
+            'Set it to run code-edit e2e tests.',
+        )
+        return
+      }
+
+      const binary = requireFreebuffBinary()
+      const initialContent = [
+        'function greet(name) {',
+        "  return 'Hello, ' + name",
+        '}',
+        '',
+      ].join('\n')
+
+      // Create the file before starting freebuff so it's in the initial context
+      session = await FreebuffSession.start(binary, {
+        waitSeconds: 5,
+        initialFiles: { 'index.js': initialContent },
+      })
+
+      // Verify the file was created
+      expect(session.readFile('index.js')).toBe(initialContent)
+
+      // Send a prompt asking freebuff to add a console.log
+      await session.send("Add a console.log('hello world') to index.js")
+
+      // Wait for the file to be modified with the console.log
+      const finalContent = await session.waitForFileContent(
+        'index.js',
+        'console.log',
+        120_000,
+      )
+
+      expect(finalContent).toContain('console.log')
+      expect(finalContent).toContain('hello world')
+      // The original function should still be present
+      expect(finalContent).toContain('function greet')
+    },
+    TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/help-command.e2e.test.ts b/freebuff/e2e/tests/help-command.e2e.test.ts
new file mode 100644
index 0000000000..173a3425b8
--- /dev/null
+++ b/freebuff/e2e/tests/help-command.e2e.test.ts
@@ -0,0 +1,77 @@
+import { execSync } from 'child_process'
+
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const TEST_TIMEOUT = 60_000
+
+describe('Freebuff: --help flag', () => {
+  test('shows CLI usage information', () => {
+    const binary = requireFreebuffBinary()
+    const output = execSync(`'${binary}' --help`, {
+      encoding: 'utf-8',
+      timeout: 10_000,
+    })
+
+    // Should show the binary name
+    expect(output.toLowerCase()).toContain('freebuff')
+
+    // Should show usage info
+    expect(output).toMatch(/usage|options|commands/i)
+  })
+
+  test('does not reference Codebuff', () => {
+    const binary = requireFreebuffBinary()
+    const output = execSync(`'${binary}' --help`, {
+      encoding: 'utf-8',
+      timeout: 10_000,
+    })
+
+    // The --help output should say Freebuff, not Codebuff
+    expect(output).not.toMatch(/\bcodebuff\b/i)
+  })
+})
+
+describe('Freebuff: /help slash command', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'shows help content when /help is entered',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      await session.send('/help')
+      const output = await session.capture(2)
+
+      // Should show shortcuts section
+      expect(output).toMatch(/shortcut|ctrl|esc/i)
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'does not show subscription commands in help',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      await session.send('/help')
+      const output = await session.capture(2)
+
+      // Freebuff should NOT show these paid/subscription commands
+      expect(output).not.toContain('/subscribe')
+      expect(output).not.toContain('/usage')
+      expect(output).not.toContain('/credits')
+    },
+    TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/slash-commands.e2e.test.ts b/freebuff/e2e/tests/slash-commands.e2e.test.ts
new file mode 100644
index 0000000000..8631a3d4e6
--- /dev/null
+++ b/freebuff/e2e/tests/slash-commands.e2e.test.ts
@@ -0,0 +1,107 @@
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const TEST_TIMEOUT = 60_000
+
+/**
+ * Commands that should be REMOVED in Freebuff.
+ * These are stripped at build time via the FREEBUFF_REMOVED_COMMAND_IDS set
+ * in cli/src/data/slash-commands.ts.
+ */
+const REMOVED_COMMANDS = [
+  '/subscribe',
+  '/usage',
+  '/credits',
+  '/ads:enable',
+  '/ads:disable',
+  '/connect:claude',
+  '/refer-friends',
+  '/agent:gpt-5',
+  '/image',
+  '/publish',
+  '/init',
+]
+
+/**
+ * Commands that should be KEPT in Freebuff.
+ * Only includes commands reliably visible in the initial autocomplete viewport.
+ * Commands like /logout and /exit exist but may be scrolled off-screen.
+ */
+const KEPT_COMMANDS = [
+  '/help',
+  '/new',
+  '/history',
+  '/feedback',
+  '/bash',
+  '/theme:toggle',
+]
+
+describe('Freebuff: Slash Commands', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'slash command menu does not show removed commands',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Type "/" to trigger the slash command autocomplete menu
+      await session.send('/', { noEnter: true })
+      const output = await session.capture(2)
+
+      // Removed commands should NOT appear in the autocomplete menu
+      for (const cmd of REMOVED_COMMANDS) {
+        // Strip the leading slash for matching since the menu shows command ids
+        const cmdId = cmd.slice(1)
+        expect(output).not.toContain(cmdId)
+      }
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'slash command menu shows kept commands',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Type "/" to trigger the slash command autocomplete menu
+      await session.send('/', { noEnter: true })
+      const output = await session.capture(2)
+
+      // Kept commands SHOULD appear in the autocomplete menu
+      for (const cmd of KEPT_COMMANDS) {
+        const cmdId = cmd.slice(1)
+        expect(output).toContain(cmdId)
+      }
+    },
+    TEST_TIMEOUT,
+  )
+
+  test(
+    'no mode-related slash commands are visible',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Type "/mode" to check for mode commands
+      await session.send('/mode', { noEnter: true })
+      const output = await session.capture(2)
+
+      // Mode commands should not exist in Freebuff
+      expect(output).not.toContain('mode:max')
+      expect(output).not.toContain('mode:default')
+      expect(output).not.toContain('mode:lite')
+      expect(output).not.toContain('mode:free')
+    },
+    TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/startup.e2e.test.ts b/freebuff/e2e/tests/startup.e2e.test.ts
new file mode 100644
index 0000000000..173520bfaa
--- /dev/null
+++ b/freebuff/e2e/tests/startup.e2e.test.ts
@@ -0,0 +1,71 @@
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const STARTUP_TIMEOUT = 60_000
+
+describe('Freebuff: Startup', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'binary starts without crashing',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary)
+      const output = await session.capture(3)
+
+      // Should not contain fatal errors
+      expect(output).not.toContain('FATAL')
+      expect(output).not.toContain('panic')
+      expect(output).not.toContain('Segmentation fault')
+
+      // Should have some visible output (not a blank screen)
+      const nonEmptyLines = output
+        .split('\n')
+        .filter((line) => line.trim().length > 0)
+      expect(nonEmptyLines.length).toBeGreaterThan(0)
+    },
+    STARTUP_TIMEOUT,
+  )
+
+  test(
+    'shows Freebuff branding',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary)
+      const output = await session.capture(3)
+
+      // The CLI should identify itself as Freebuff, not Codebuff
+      const lowerOutput = output.toLowerCase()
+      expect(lowerOutput).toContain('freebuff')
+    },
+    STARTUP_TIMEOUT,
+  )
+
+  test(
+    'responds to Ctrl+C gracefully',
+    async () => {
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary)
+
+      // Wait for startup, then send Ctrl+C
+      await session.capture(2)
+      await session.sendKey('C-c')
+
+      // Give it a moment to process
+      const output = await session.capture(1)
+
+      // Should not show an unhandled error
+      expect(output).not.toContain('Unhandled')
+      expect(output).not.toContain('FATAL')
+    },
+    STARTUP_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/terminal-command.e2e.test.ts b/freebuff/e2e/tests/terminal-command.e2e.test.ts
new file mode 100644
index 0000000000..9c3486d1ed
--- /dev/null
+++ b/freebuff/e2e/tests/terminal-command.e2e.test.ts
@@ -0,0 +1,68 @@
+/**
+ * E2E test that verifies Freebuff can run terminal commands.
+ *
+ * Starts Freebuff in tmux, sends a prompt asking it to run a shell command,
+ * and verifies the command was executed by checking its side effects.
+ *
+ * Requires CODEBUFF_API_KEY — if unset, the test logs a notice and returns early.
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+
+import { FreebuffSession, requireFreebuffBinary } from '../utils'
+
+const TEST_TIMEOUT = 180_000
+
+function getApiKey(): string | null {
+  return process.env.CODEBUFF_API_KEY ?? null
+}
+
+describe('Freebuff: Terminal Command', () => {
+  let session: FreebuffSession | null = null
+
+  afterEach(async () => {
+    if (session) {
+      await session.stop()
+      session = null
+    }
+  })
+
+  test(
+    'runs a terminal command that creates a file',
+    async () => {
+      if (!getApiKey()) {
+        console.log(
+          'Skipping terminal-command test: CODEBUFF_API_KEY not set. ' +
+            'Set it to run terminal-command e2e tests.',
+        )
+        return
+      }
+
+      const binary = requireFreebuffBinary()
+      session = await FreebuffSession.start(binary, { waitSeconds: 5 })
+
+      // Ask freebuff to run a shell command whose output can only come from
+      // actual terminal execution (not file-writing tools)
+      await session.send(
+        'Use the terminal to run: date +%s > timestamp.txt && echo done',
+      )
+
+      // Wait for the file to be created by the terminal command
+      const content = await session.waitForFileContent(
+        'timestamp.txt',
+        '',
+        120_000,
+      )
+
+      // The file should contain a Unix timestamp (numeric string)
+      const trimmed = content.trim()
+      expect(trimmed).toMatch(/^\d{10,}$/)
+
+      // Verify the timestamp is recent (within the last 5 minutes)
+      const timestamp = parseInt(trimmed, 10)
+      const now = Math.floor(Date.now() / 1000)
+      expect(Math.abs(now - timestamp)).toBeLessThan(300)
+    },
+    TEST_TIMEOUT,
+  )
+})
diff --git a/freebuff/e2e/tests/version.e2e.test.ts b/freebuff/e2e/tests/version.e2e.test.ts
new file mode 100644
index 0000000000..d204bd684e
--- /dev/null
+++ b/freebuff/e2e/tests/version.e2e.test.ts
@@ -0,0 +1,24 @@
+import { execSync } from 'child_process'
+
+import { describe, expect, test } from 'bun:test'
+
+import { requireFreebuffBinary } from '../utils'
+
+describe('Freebuff: --version', () => {
+  test('outputs a version string', () => {
+    const binary = requireFreebuffBinary()
+    const output = execSync(`'${binary}' --version`, {
+      encoding: 'utf-8',
+      timeout: 10_000,
+    }).trim()
+
+    // Should contain a semver-like version (e.g. "0.0.15" or "1.0.0")
+    expect(output).toMatch(/\d+\.\d+\.\d+/)
+  })
+
+  test('exits with code 0', () => {
+    const binary = requireFreebuffBinary()
+    // execSync throws on non-zero exit codes, so if this doesn't throw, it exited 0
+    execSync(`'${binary}' --version`, { encoding: 'utf-8', timeout: 10_000 })
+  })
+})
diff --git a/freebuff/e2e/utils/binary-helpers.ts b/freebuff/e2e/utils/binary-helpers.ts
new file mode 100644
index 0000000000..c233574dd4
--- /dev/null
+++ b/freebuff/e2e/utils/binary-helpers.ts
@@ -0,0 +1,24 @@
+import { existsSync } from 'fs'
+import { dirname, resolve } from 'path'
+import { fileURLToPath } from 'url'
+
+const __dirname = dirname(fileURLToPath(import.meta.url))
+export const REPO_ROOT = resolve(__dirname, '../../..')
+
+/** Absolute path of the binary to test: FREEBUFF_BINARY if set, else <repo>/cli/bin/freebuff. */
+export function getFreebuffBinaryPath(): string {
+  const override = process.env.FREEBUFF_BINARY
+  const defaultPath = resolve(REPO_ROOT, 'cli/bin/freebuff')
+  return override ? resolve(override) : defaultPath
+}
+
+/**
+ * Returns the Freebuff binary path, or throws (with a build hint) if nothing exists there.
+ */
+export function requireFreebuffBinary(): string {
+  const candidate = getFreebuffBinaryPath()
+  if (existsSync(candidate)) return candidate
+
+  const buildHint = 'Build with: bun freebuff/cli/build.ts <version>'
+  throw new Error(`Freebuff binary not found at ${candidate}. ` + buildHint)
+}
diff --git a/freebuff/e2e/utils/freebuff-session.ts b/freebuff/e2e/utils/freebuff-session.ts
new file mode 100644
index 0000000000..5521534434
--- /dev/null
+++ b/freebuff/e2e/utils/freebuff-session.ts
@@ -0,0 +1,162 @@
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+
+import { tmuxCapture, tmuxSend, tmuxSendKey, tmuxStart, tmuxStop } from './tmux-helpers'
+
+export class FreebuffSession {
+  public readonly name: string
+  public readonly workDir: string
+
+  private constructor(sessionName: string, workDir: string) {
+    this.name = sessionName
+    this.workDir = workDir
+  }
+
+  /** Write `content` to `relativePath` under `rootDir`, creating parent directories. */
+  private static writeInto(rootDir: string, relativePath: string, content: string): void {
+    const filePath = path.join(rootDir, relativePath)
+    fs.mkdirSync(path.dirname(filePath), { recursive: true })
+    fs.writeFileSync(filePath, content, 'utf-8')
+  }
+
+  /**
+   * Start a freebuff binary in a tmux session.
+   * Creates a temporary working directory to simulate a real user project.
+   * If setup or the tmux launch throws, that directory is removed before rethrowing.
+   */
+  static async start(
+    binaryPath: string,
+    options?: {
+      waitSeconds?: number
+      width?: number
+      height?: number
+      initialFiles?: Record<string, string>
+    },
+  ): Promise<FreebuffSession> {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'freebuff-e2e-'))
+    try {
+      // Create a minimal project so freebuff has something to work with
+      FreebuffSession.writeInto(tmpDir, 'README.md', '# E2E Test Project\n')
+      // Write any initial files before starting the binary
+      for (const [relativePath, content] of Object.entries(options?.initialFiles ?? {})) {
+        FreebuffSession.writeInto(tmpDir, relativePath, content)
+      }
+
+      const sessionName = tmuxStart({
+        command: `cd '${tmpDir}' && '${binaryPath}'`,
+        waitSeconds: options?.waitSeconds ?? 4,
+        width: options?.width ?? 120,
+        height: options?.height ?? 30,
+      })
+      return new FreebuffSession(sessionName, tmpDir)
+    } catch (error) {
+      try {
+        fs.rmSync(tmpDir, { recursive: true, force: true })
+      } catch {
+        // Ignore cleanup errors so the original failure is the one reported
+      }
+      throw error
+    }
+  }
+
+  /** Write a file into the session's working directory. */
+  writeFile(relativePath: string, content: string): void {
+    FreebuffSession.writeInto(this.workDir, relativePath, content)
+  }
+
+  /** Read a file from the session's working directory. */
+  readFile(relativePath: string): string {
+    return fs.readFileSync(path.join(this.workDir, relativePath), 'utf-8')
+  }
+
+  /** Check if a file exists in the session's working directory. */
+  fileExists(relativePath: string): boolean {
+    return fs.existsSync(path.join(this.workDir, relativePath))
+  }
+
+  /**
+   * Poll until a file in the working directory contains the given text.
+   * An empty pattern waits until the file has any content, so a file that a
+   * shell redirect has created but not yet written to is not returned empty.
+   * Throws if the timeout is exceeded.
+   */
+  async waitForFileContent(
+    relativePath: string,
+    pattern: string,
+    timeoutMs = 60_000,
+  ): Promise<string> {
+    const start = Date.now()
+    while (Date.now() - start < timeoutMs) {
+      try {
+        const content = this.readFile(relativePath)
+        const matched = pattern === '' ? content.length > 0 : content.includes(pattern)
+        if (matched) return content
+      } catch {
+        // File may not exist yet
+      }
+      await new Promise((resolve) => setTimeout(resolve, 1_000))
+    }
+    let finalContent = '(file does not exist)'
+    try {
+      finalContent = this.readFile(relativePath)
+    } catch {
+      // ignore
+    }
+    throw new Error(
+      `Timed out after ${timeoutMs}ms waiting for "${pattern}" in ${relativePath}.\n` +
+        `Last content:\n${finalContent}`,
+    )
+  }
+
+  /** Send text input to the freebuff CLI (presses Enter by default). */
+  async send(
+    text: string,
+    options?: { noEnter?: boolean; waitIdle?: number },
+  ): Promise<void> {
+    tmuxSend(this.name, text, { ...options, force: true })
+  }
+
+  /** Send a special key (e.g. Escape, C-c, Enter). */
+  async sendKey(key: string): Promise<void> {
+    tmuxSendKey(this.name, key)
+  }
+
+  /** Capture current terminal output, optionally waiting first. */
+  async capture(waitSeconds?: number): Promise<string> {
+    return tmuxCapture(this.name, { waitSeconds, noSave: true })
+  }
+
+  /** Capture and auto-save to the session logs directory with a label. */
+  async captureLabeled(label: string, waitSeconds?: number): Promise<string> {
+    return tmuxCapture(this.name, { waitSeconds, label })
+  }
+
+  /**
+   * Poll until the terminal output contains the given text.
+   * Throws if the timeout is exceeded.
+   */
+  async waitForText(pattern: string, timeoutMs = 30_000): Promise<string> {
+    const start = Date.now()
+    while (Date.now() - start < timeoutMs) {
+      const output = await this.capture()
+      if (output.includes(pattern)) return output
+      await new Promise((resolve) => setTimeout(resolve, 500))
+    }
+    const finalOutput = await this.capture()
+    throw new Error(
+      `Timed out after ${timeoutMs}ms waiting for "${pattern}".\n` +
+        `Last output:\n${finalOutput}`,
+    )
+  }
+
+  /** Stop the tmux session and clean up the temp directory. */
+  async stop(): Promise<void> {
+    tmuxStop(this.name)
+    try {
+      fs.rmSync(this.workDir, { recursive: true, force: true })
+    } catch {
+      // Ignore cleanup errors
+    }
+  }
+}
diff --git a/freebuff/e2e/utils/index.ts b/freebuff/e2e/utils/index.ts
new file mode 100644
index 0000000000..6927a4abd4
--- /dev/null
+++ b/freebuff/e2e/utils/index.ts
@@ -0,0 +1,10 @@
+export { getFreebuffBinaryPath, requireFreebuffBinary, REPO_ROOT } from './binary-helpers'
+export { FreebuffSession } from './freebuff-session'
+export { createFreebuffTmuxTools } from './tmux-custom-tools'
+export {
+  tmuxStart,
+  tmuxSend,
+  tmuxSendKey,
+  tmuxCapture,
+  tmuxStop,
+} from './tmux-helpers'
diff --git a/freebuff/e2e/utils/tmux-custom-tools.ts b/freebuff/e2e/utils/tmux-custom-tools.ts
new file mode 100644
index 0000000000..92af618934
--- /dev/null
+++ b/freebuff/e2e/utils/tmux-custom-tools.ts
@@ -0,0 +1,155 @@
+import { z } from 'zod/v4'
+
+import { FreebuffSession } from './freebuff-session'
+
+import type { ZodType } from 'zod/v4'
+
+interface FreebuffToolDefinition {
+  toolName: string
+  description: string
+  inputSchema: ZodType
+  endsAgentStep: boolean
+  exampleInputs: Record<string, unknown>[]
+  execute: (input: Record<string, unknown>) => Promise<ToolOutput>
+}
+
+type ToolOutput = { type: 'json'; value: Record<string, unknown> }[]
+
+/**
+ * Creates custom tool definitions that allow a Codebuff SDK agent
+ * to interact with a Freebuff CLI binary via tmux.
+ *
+ * Returns the tools array and a cleanup function to call in afterEach.
+ *
+ * Usage:
+ * ```ts
+ * const { tools, cleanup } = createFreebuffTmuxTools(binaryPath)
+ * // ... pass tools to client.run({ customToolDefinitions: tools })
+ * // ... in afterEach: await cleanup()
+ * ```
+ */
+export function createFreebuffTmuxTools(binaryPath: string): {
+  tools: FreebuffToolDefinition[]
+  cleanup: () => Promise<void>
+} {
+  // The one Freebuff session shared by every tool below; null while nothing
+  // is running. The tool closures reassign it, hence `let`.
+  let active: FreebuffSession | null = null
+
+  // Every tool result is a single JSON part; this wraps a payload in that shape.
+  const json = (value: Record<string, unknown>): ToolOutput => [
+    { type: 'json', value },
+  ]
+
+  // Result for tools that need a running session when none has been started.
+  // Built fresh on each call so callers never share a result object.
+  const noSession = (): ToolOutput =>
+    json({ error: 'No session running. Call start_freebuff first.' })
+
+  // Stops the active session, if any, and forgets it.
+  // Resolves to true when a session was actually running.
+  const stopActive = async (): Promise<boolean> => {
+    if (!active) return false
+    await active.stop()
+    active = null
+    return true
+  }
+
+  // start_freebuff: launches the binary, then captures the screen (with a
+  // waitSeconds of 2) as the initial output.
+  const startTool: FreebuffToolDefinition = {
+    toolName: 'start_freebuff',
+    description:
+      'Start the Freebuff CLI binary in a tmux terminal session. Call this first before interacting with Freebuff.',
+    inputSchema: z.object({}),
+    endsAgentStep: true,
+    exampleInputs: [{}],
+    execute: async (): Promise<ToolOutput> => {
+      // Only one session at a time: report the existing one instead of starting another
+      if (active) {
+        return json({
+          error: 'Session already running',
+          sessionName: active.name,
+        })
+      }
+      active = await FreebuffSession.start(binaryPath)
+      const initialOutput = await active.capture(2)
+      return json({
+        started: true,
+        sessionName: active.name,
+        initialOutput,
+      })
+    },
+  }
+
+  // send_to_freebuff: forwards the text through session.send, which presses
+  // Enter after it by default. This is the only tool that does not end the
+  // agent step.
+  const sendInputTool: FreebuffToolDefinition = {
+    toolName: 'send_to_freebuff',
+    description:
+      'Send text input to the running Freebuff CLI. The text is sent as if typed by the user and Enter is pressed.',
+    inputSchema: z.object({
+      text: z.string().describe('Text to send to Freebuff'),
+    }),
+    endsAgentStep: false,
+    exampleInputs: [{ text: '/help' }],
+    execute: async (input): Promise<ToolOutput> => {
+      // `input` is cast rather than re-validated here
+      const { text } = input as { text: string }
+      if (!active) return noSession()
+      await active.send(text)
+      return json({ sent: true, text })
+    },
+  }
+
+  // capture_freebuff_output: returns the current terminal contents, passing
+  // the optional waitSeconds through to session.capture.
+  const captureOutputTool: FreebuffToolDefinition = {
+    toolName: 'capture_freebuff_output',
+    description:
+      'Capture the current terminal output from the running Freebuff CLI session. ' +
+      'Use waitSeconds to wait before capturing (useful after sending a command).',
+    inputSchema: z.object({
+      waitSeconds: z
+        .number()
+        .optional()
+        .describe('Seconds to wait before capturing (default: 0)'),
+    }),
+    endsAgentStep: true,
+    exampleInputs: [{ waitSeconds: 2 }],
+    execute: async (input): Promise<ToolOutput> => {
+      const { waitSeconds } = input as { waitSeconds?: number }
+      if (!active) return noSession()
+      const output = await active.capture(waitSeconds)
+      return json({ output })
+    },
+  }
+
+  // stop_freebuff: tears the session down. Calling it with no session running
+  // is not an error; the result just reports wasRunning: false.
+  const stopTool: FreebuffToolDefinition = {
+    toolName: 'stop_freebuff',
+    description:
+      'Stop the running Freebuff CLI session and clean up resources. Always call this when done testing.',
+    inputSchema: z.object({}),
+    endsAgentStep: true,
+    exampleInputs: [{}],
+    execute: async (): Promise<ToolOutput> => {
+      const wasRunning = await stopActive()
+      return json({ stopped: true, wasRunning })
+    },
+  }
+
+  // Same teardown as stop_freebuff, for callers (e.g. afterEach) that do not
+  // need a tool result.
+  const cleanup = async (): Promise<void> => {
+    await stopActive()
+  }
+
+  // Tools are listed in the order start, send, capture, stop.
+  return {
+    tools: [startTool, sendInputTool, captureOutputTool, stopTool],
+    cleanup,
+  }
+}
diff --git a/freebuff/e2e/utils/tmux-helpers.ts b/freebuff/e2e/utils/tmux-helpers.ts
new file mode 100644
index 0000000000..40999a3360
--- /dev/null
+++ b/freebuff/e2e/utils/tmux-helpers.ts
@@ -0,0 +1,83 @@
+import { execFileSync } from 'child_process'
+
+import { REPO_ROOT } from './binary-helpers'
+
+const SCRIPTS_DIR = `${REPO_ROOT}/scripts/tmux`
+
+const EXEC_OPTIONS = { encoding: 'utf-8' as const, cwd: REPO_ROOT }
+
+export interface TmuxStartOptions {
+  command: string
+  name?: string
+  width?: number
+  height?: number
+  waitSeconds?: number
+}
+
+// Runs scripts/tmux/tmux-start.sh and returns its trimmed stdout, which callers
+// use as the session name. Falsy name/width/height are omitted; waitSeconds
+// is forwarded whenever it is defined (including 0).
+export function tmuxStart(options: TmuxStartOptions): string {
+  const { command, name, width, height, waitSeconds } = options
+  const optionalFlags = [
+    ...(name ? ['--name', name] : []),
+    ...(width ? ['--width', String(width)] : []),
+    ...(height ? ['--height', String(height)] : []),
+    ...(waitSeconds !== undefined ? ['--wait', String(waitSeconds)] : []),
+  ]
+  const script = `${SCRIPTS_DIR}/tmux-start.sh`
+  const argv = [script, '--command', command, '--plain', ...optionalFlags]
+  return execFileSync('bash', argv, EXEC_OPTIONS).trim()
+}
+
+// Sends `text` to a tmux session through scripts/tmux/tmux-send.sh.
+// Each option maps to a flag and is only passed when truthy (waitIdle 0 is omitted).
+export function tmuxSend(
+  sessionName: string,
+  text: string,
+  options?: { noEnter?: boolean; waitIdle?: number; force?: boolean },
+): void {
+  const { noEnter, waitIdle, force } = options ?? {}
+  const flags: string[] = []
+  if (noEnter) flags.push('--no-enter')
+  if (waitIdle) flags.push('--wait-idle', String(waitIdle))
+  if (force) flags.push('--force')
+
+  const script = `${SCRIPTS_DIR}/tmux-send.sh`
+  execFileSync('bash', [script, sessionName, text, ...flags], EXEC_OPTIONS)
+}
+
+// Sends one named key to the session by calling tmux-send.sh with --key.
+export function tmuxSendKey(sessionName: string, key: string): void {
+  const script = `${SCRIPTS_DIR}/tmux-send.sh`
+  const argv = [script, sessionName, '--key', key]
+
+  execFileSync('bash', argv, EXEC_OPTIONS)
+}
+
+// Runs tmux-capture.sh for the session and returns the script's stdout.
+// stderr is piped rather than inherited, so it is not echoed to the console.
+export function tmuxCapture(
+  sessionName: string,
+  options?: { waitSeconds?: number; label?: string; noSave?: boolean },
+): string {
+  const { waitSeconds, label, noSave } = options ?? {}
+  const argv = [`${SCRIPTS_DIR}/tmux-capture.sh`, sessionName]
+  if (waitSeconds) argv.push('--wait', String(waitSeconds))
+  if (label) argv.push('--label', label)
+  if (noSave) argv.push('--no-save')
+
+  return execFileSync('bash', argv, { ...EXEC_OPTIONS, stdio: ['pipe', 'pipe', 'pipe'] })
+}
+
+// Stops a tmux session via tmux-stop.sh; any error from the script is swallowed.
+export function tmuxStop(sessionName: string): void {
+  const script = `${SCRIPTS_DIR}/tmux-stop.sh`
+  const argv = [script, sessionName]
+
+  try {
+    execFileSync('bash', argv, EXEC_OPTIONS)
+  } catch {
+    // tmux-stop.sh is idempotent; ignore errors if session already gone
+  }
+}
diff --git a/freebuff/package.json b/freebuff/package.json
index 286a863793..03fb9d35e4 100644
--- a/freebuff/package.json
+++ b/freebuff/package.json
@@ -3,6 +3,17 @@
   "version": "1.0.0",
   "private": true,
   "scripts": {
-    "release": "bun cli/release.ts"
+    "release": "bun cli/release.ts",
+    "build:binary": "bun cli/build.ts 0.0.0-dev",
+    "e2e": "bun test e2e/tests/",
+    "e2e:version": "bun test e2e/tests/version.e2e.test.ts",
+    "e2e:startup": "bun test e2e/tests/startup.e2e.test.ts",
+    "e2e:help": "bun test e2e/tests/help-command.e2e.test.ts",
+    "e2e:slash-commands": "bun test e2e/tests/slash-commands.e2e.test.ts",
+    "e2e:mode": "bun test e2e/tests/mode-restriction.e2e.test.ts",
+    "e2e:ads": "bun test e2e/tests/ads-behavior.e2e.test.ts",
+    "e2e:agent": "bun test e2e/tests/agent-startup.e2e.test.ts",
+    "e2e:code-edit": "bun test e2e/tests/code-edit.e2e.test.ts",
+    "e2e:terminal-command": "bun test e2e/tests/terminal-command.e2e.test.ts"
   }
 }
diff --git a/packages/agent-runtime/src/__tests__/loop-agent-steps.test.ts b/packages/agent-runtime/src/__tests__/loop-agent-steps.test.ts
index 3f0ab73d4a..63ddf60d24 100644
--- a/packages/agent-runtime/src/__tests__/loop-agent-steps.test.ts
+++ b/packages/agent-runtime/src/__tests__/loop-agent-steps.test.ts
@@ -20,6 +20,7 @@ import {
   mock,
   spyOn,
 } from 'bun:test'
+import { APICallError } from 'ai'
 import { z } from 'zod/v4'
 
 import { loopAgentSteps } from '../run-agent-step'
@@ -931,4 +932,89 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       expect(llmCallCount).toBe(0)
     })
   })
+
+  describe('API error handling', () => {
+    const CHAT_COMPLETIONS_URL = 'https://api.codebuff.com/v1/chat/completions'
+
+    /**
+     * Shared driver for the cases below: runs loopAgentSteps for 'test-agent'
+     * using an LLM-only template (handleSteps removed) and a promptAiSdkStream
+     * that throws the error produced by `makeError`.
+     */
+    const runWithStreamError = async (makeError: () => APICallError) => {
+      const llmOnlyTemplate = {
+        ...mockTemplate,
+        handleSteps: undefined,
+      }
+      const localAgentTemplates = {
+        'test-agent': llmOnlyTemplate,
+      }
+
+      // The error is built inside the generator, i.e. when the stream is consumed
+      loopAgentStepsBaseParams.promptAiSdkStream = async function* () {
+        throw makeError()
+      }
+
+      return loopAgentSteps({
+        ...loopAgentStepsBaseParams,
+        agentType: 'test-agent',
+        localAgentTemplates,
+      })
+    }
+
+    it('should propagate error code and server message from 403 APICallError responseBody', async () => {
+      // Throw an APICallError with a 403 status and a responseBody containing
+      // the server's structured error
+      const { output } = await runWithStreamError(
+        () =>
+          new APICallError({
+            statusCode: 403,
+            message: 'Forbidden',
+            url: CHAT_COMPLETIONS_URL,
+            requestBodyValues: {},
+            responseBody: JSON.stringify({
+              error: 'free_mode_unavailable',
+              message: 'Free mode is not available in your country.',
+            }),
+            isRetryable: false,
+          }),
+      )
+
+      expect(output.type).toBe('error')
+      if (output.type === 'error') {
+        // Should use the server's message, NOT the generic "Forbidden"
+        expect(output.message).toBe('Free mode is not available in your country.')
+        // Should NOT have the 'Agent run error: ' prefix since message came from responseBody
+        expect(output.message).not.toContain('Agent run error:')
+        // Should propagate the error code so the CLI can match on it
+        expect(output.error).toBe('free_mode_unavailable')
+        // Should propagate the status code
+        expect(output.statusCode).toBe(403)
+      }
+    })
+
+    it('should prefix with "Agent run error:" when responseBody has no parseable message', async () => {
+      // APICallError with no responseBody
+      const { output } = await runWithStreamError(
+        () =>
+          new APICallError({
+            statusCode: 500,
+            message: 'Internal Server Error',
+            url: CHAT_COMPLETIONS_URL,
+            requestBodyValues: {},
+            responseBody: undefined,
+            isRetryable: true,
+          }),
+      )
+
+      expect(output.type).toBe('error')
+      if (output.type === 'error') {
+        // Should have the prefix since there's no server message
+        expect(output.message).toContain('Agent run error:')
+        expect(output.message).toContain('Internal Server Error')
+        // No error code since responseBody wasn't parseable
+        expect(output.error).toBeUndefined()
+      }
+    })
+  })
 })
diff --git a/packages/agent-runtime/src/run-agent-step.ts b/packages/agent-runtime/src/run-agent-step.ts
index b323d5f0f5..992db72aa7 100644
--- a/packages/agent-runtime/src/run-agent-step.ts
+++ b/packages/agent-runtime/src/run-agent-step.ts
@@ -2,7 +2,7 @@ import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events'
 import { supportsCacheControl } from '@codebuff/common/old-constants'
 import { TOOLS_WHICH_WONT_FORCE_NEXT_STEP } from '@codebuff/common/tools/constants'
 import { buildArray } from '@codebuff/common/util/array'
-import { AbortError, getErrorObject, isAbortError } from '@codebuff/common/util/error'
+import { AbortError, getErrorObject, isAbortError, parseApiErrorResponseBody } from '@codebuff/common/util/error'
 import { serializeCacheDebugCorrelation } from '@codebuff/common/util/cache-debug'
 import { systemMessage, userMessage } from '@codebuff/common/util/messages'
 import { APICallError, type ToolSet } from 'ai'
@@ -1069,8 +1069,16 @@ export async function loopAgentSteps(
     )
 
     let errorMessage = ''
+    let errorCode: string | undefined
+    let hasServerMessage = false
     if (error instanceof APICallError) {
       errorMessage = `${error.message}`
+      const parsed = parseApiErrorResponseBody(error.responseBody)
+      if (parsed.errorCode) errorCode = parsed.errorCode
+      if (parsed.message) {
+        errorMessage = parsed.message
+        hasServerMessage = true
+      }
     } else {
       // Extract clean error message (just the message, not name:message format)
       errorMessage =
@@ -1101,8 +1109,9 @@ export async function loopAgentSteps(
       agentState: currentAgentState,
       output: {
         type: 'error',
-        message: 'Agent run error: ' + errorMessage,
+        message: hasServerMessage ? errorMessage : 'Agent run error: ' + errorMessage,
         ...(statusCode !== undefined && { statusCode }),
+        ...(errorCode !== undefined && { error: errorCode }),
       },
     }
   }
diff --git a/sdk/src/run.ts b/sdk/src/run.ts
index 13b6562624..f0d150ca01 100644
--- a/sdk/src/run.ts
+++ b/sdk/src/run.ts
@@ -15,6 +15,7 @@ import {
 import { toolNames } from '@codebuff/common/tools/constants'
 import { clientToolCallSchema } from '@codebuff/common/tools/list'
 import { AgentOutputSchema } from '@codebuff/common/types/session-state'
+import { parseApiErrorResponseBody } from '@codebuff/common/util/error'
 import { cloneDeep } from 'lodash'
 
 import { getErrorStatusCode } from './error-utils'
@@ -516,25 +517,13 @@ async function runOnce({
 
     // Extract structured error details from the API response body
     // (e.g., AI SDK's AI_APICallError includes a responseBody with the server's JSON response)
-    let errorCode: string | undefined
     const responseBody =
       error && typeof error === 'object' && 'responseBody' in error
         ? (error as { responseBody: unknown }).responseBody
         : undefined
-    if (typeof responseBody === 'string') {
-      try {
-        const parsed: unknown = JSON.parse(responseBody)
-        if (parsed && typeof parsed === 'object') {
-          if ('error' in parsed && typeof (parsed as { error: unknown }).error === 'string') {
-            errorCode = (parsed as { error: string }).error
-          }
-          if ('message' in parsed && typeof (parsed as { message: unknown }).message === 'string') {
-            errorMessage = (parsed as { message: string }).message
-          }
-        }
-      } catch {
-        // responseBody wasn't valid JSON; keep original errorMessage
-      }
+    const { errorCode, message: parsedMessage } = parseApiErrorResponseBody(responseBody)
+    if (parsedMessage) {
+      errorMessage = parsedMessage
     }
 
     resolve({