diff --git a/.github/workflows/freebuff-e2e.yml b/.github/workflows/freebuff-e2e.yml new file mode 100644 index 0000000000..acf0a63e3f --- /dev/null +++ b/.github/workflows/freebuff-e2e.yml @@ -0,0 +1,111 @@ +name: Freebuff E2E Tests + +on: + push: + branches: ['main'] + pull_request: + branches: ['main'] + workflow_dispatch: # Manual trigger + +concurrency: + group: freebuff-e2e-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-freebuff: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - uses: ./.github/actions/setup-project + + - name: Set environment variables + env: + SECRETS_CONTEXT: ${{ toJSON(secrets) }} + run: | + VAR_NAMES=$(bun scripts/generate-ci-env.ts --scope client) + echo "$SECRETS_CONTEXT" | jq -r --argjson vars "$VAR_NAMES" ' + to_entries | .[] | select(.key as $k | $vars | index($k)) | .key + "=" + .value + ' >> $GITHUB_ENV + echo "FREEBUFF_MODE=true" >> $GITHUB_ENV + echo "NEXT_PUBLIC_CB_ENVIRONMENT=prod" >> $GITHUB_ENV + echo "CODEBUFF_GITHUB_ACTIONS=true" >> $GITHUB_ENV + + - name: Build Freebuff binary + run: bun freebuff/cli/build.ts 0.0.0-e2e + + - name: Smoke test binary + run: | + chmod +x cli/bin/freebuff + cli/bin/freebuff --version + + - name: Upload binary + uses: actions/upload-artifact@v7 + with: + name: freebuff-binary + path: cli/bin/freebuff + retention-days: 1 + + e2e: + needs: build-freebuff + runs-on: ubuntu-latest + timeout-minutes: 10 + strategy: + fail-fast: false + matrix: + test: + - version + - startup + - help-command + - slash-commands + - ads-behavior + - agent-startup + - code-edit + - terminal-command + name: e2e-${{ matrix.test }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - uses: ./.github/actions/setup-project + + - name: Install tmux + run: sudo apt-get update && sudo apt-get install -y tmux + + - name: Download Freebuff binary + uses: actions/download-artifact@v4 + with: + name: freebuff-binary + path: 
cli/bin/ + + - name: Make binary executable + run: chmod +x cli/bin/freebuff + + - name: Set environment variables + env: + SECRETS_CONTEXT: ${{ toJSON(secrets) }} + run: | + VAR_NAMES=$(bun scripts/generate-ci-env.ts) + echo "$SECRETS_CONTEXT" | jq -r --argjson vars "$VAR_NAMES" ' + to_entries | .[] | select(.key as $k | $vars | index($k)) | .key + "=" + .value + ' >> $GITHUB_ENV + echo "CODEBUFF_GITHUB_ACTIONS=true" >> $GITHUB_ENV + echo "NEXT_PUBLIC_CB_ENVIRONMENT=test" >> $GITHUB_ENV + echo "CODEBUFF_GITHUB_TOKEN=${{ secrets.CODEBUFF_GITHUB_TOKEN }}" >> $GITHUB_ENV + echo "CODEBUFF_API_KEY=${{ secrets.CODEBUFF_API_KEY }}" >> $GITHUB_ENV + + - name: Build SDK + run: cd sdk && bun run build + + - name: Run e2e test - ${{ matrix.test }} + run: bun test freebuff/e2e/tests/${{ matrix.test }}.e2e.test.ts --timeout=120000 + + - name: Upload tmux session logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: tmux-logs-${{ matrix.test }} + path: debug/tmux-sessions/ + retention-days: 7 diff --git a/cli/release/package.json b/cli/release/package.json index 6da3d70989..f51779ae8b 100644 --- a/cli/release/package.json +++ b/cli/release/package.json @@ -1,6 +1,6 @@ { "name": "codebuff", - "version": "1.0.630", + "version": "1.0.631", "description": "AI coding agent", "license": "MIT", "bin": { diff --git a/common/src/types/session-state.ts b/common/src/types/session-state.ts index f4ac626747..3896f87886 100644 --- a/common/src/types/session-state.ts +++ b/common/src/types/session-state.ts @@ -68,6 +68,7 @@ export const AgentOutputSchema = z.discriminatedUnion('type', [ type: z.literal('error'), message: z.string(), statusCode: z.number().optional(), + error: z.string().optional(), }), ]) export type AgentOutput = z.infer diff --git a/common/src/util/error.ts b/common/src/util/error.ts index 188df1ca9c..1861e1d399 100644 --- a/common/src/util/error.ts +++ b/common/src/util/error.ts @@ -187,6 +187,35 @@ export function unwrapPromptResult(result: 
PromptResult): T { return result.value } +/** + * Parses a JSON response body string from an API error to extract structured error details. + * Used to extract machine-readable error codes and human-readable messages from API responses + * (e.g., AI SDK's APICallError includes a responseBody with the server's JSON response). + * + * Returns extracted fields, or an empty object if the responseBody is not a valid JSON string + * with the expected shape. + */ +export function parseApiErrorResponseBody(responseBody: unknown): { + errorCode?: string + message?: string +} { + if (typeof responseBody !== 'string') return {} + try { + const parsed: unknown = JSON.parse(responseBody) + if (!parsed || typeof parsed !== 'object') return {} + const result: { errorCode?: string; message?: string } = {} + if ('error' in parsed && typeof (parsed as { error: unknown }).error === 'string') { + result.errorCode = (parsed as { error: string }).error + } + if ('message' in parsed && typeof (parsed as { message: unknown }).message === 'string') { + result.message = (parsed as { message: string }).message + } + return result + } catch { + return {} + } +} + // Extended error properties that various libraries add to Error objects interface ExtendedErrorProperties { status?: number diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000000..7e2adb3e89 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,244 @@ +# Architecture Overview + +Codebuff is a TypeScript monorepo (Bun workspaces) that provides an AI-powered coding assistant via a CLI, SDK, and web API. 
+ +## Package Dependency Graph + +``` + ┌──────────┐ + │ cli/ │ TUI client (OpenTUI + React) + └────┬─────┘ + │ + ┌────▼─────┐ + ┌───────│ sdk/ │ JS/TS SDK + │ └────┬─────┘ + │ │ + ┌───────▼────────┐ │ + │ agent-runtime/ │◄──┘ Agent execution engine + └───────┬────────┘ + │ + ┌───────────────┼───────────────┐ + │ │ │ + ┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐ + │ agents/ │ │ common/ │ │ internal/ │ + └───────────┘ └─────┬─────┘ └─────┬─────┘ + │ │ + ┌─────┼─────┐ ┌─────┼─────────┐ + │ │ │ │ │ │ + billing/ bigquery/ code-map/ web/ +``` + +## Packages + +### `cli/` — TUI Client + +The user-facing terminal UI, built with [OpenTUI](https://github.com/nickhudkins/opentui) (a React renderer for terminals) and React hooks. + +- **Entry point:** `src/index.tsx` → `src/app.tsx` → `src/chat.tsx` +- **Key responsibilities:** + - Renders the chat interface, agent output, tool call results, and status indicators + - Manages user input, slash commands (`/help`, `/usage`), and agent mode selection (DEFAULT, MAX, PLAN) + - Handles authentication (login polling, OAuth), session persistence, and chat history + - Calls `client.run()` from the SDK and processes streaming events +- **Depends on:** `sdk`, `common` + +### `sdk/` — JavaScript/TypeScript SDK + +The public SDK used by the CLI and available to external users via `@codebuff/sdk` on npm. 
+ +- **Entry point:** `src/client.ts` (`CodebuffClient`) → `src/run.ts` (`run()`) +- **Key responsibilities:** + - Orchestrates agent runs: initializes session state, registers tool handlers, calls `callMainPrompt()` + - **Executes tool calls locally** on the user's machine (file edits, terminal commands, code search) + - Manages model provider selection: Claude OAuth, ChatGPT OAuth, or Codebuff backend + - Handles credentials, retry logic, and error transformation +- **Depends on:** `agent-runtime`, `common`, `internal` (for OpenAI-compatible provider) + +### `packages/agent-runtime/` — Agent Execution Engine + +The core agent loop that drives LLM inference, tool execution, and multi-step reasoning. + +- **Entry point:** `src/main-prompt.ts` → `src/run-agent-step.ts` (`loopAgentSteps()`) +- **Key responsibilities:** + - Runs the agent loop: LLM call → process response → execute tool calls → repeat + - Manages agent templates, system prompts, and tool definitions + - Handles subagent spawning, programmatic agent steps (`handleSteps` generators) + - Processes the AI SDK stream (`streamText()`) and routes tool calls to the SDK + - Manages context token counting, cache debugging, and cost tracking +- **Depends on:** `common`, `agents` (for agent templates) + +### `common/` — Shared Library + +Shared types, utilities, constants, and tool definitions used across the entire monorepo. 
+ +- **Key areas:** + - `src/types/` — TypeScript types: `SessionState`, `AgentOutput`, `Message`, contracts for DI + - `src/tools/` — Tool parameter schemas (Zod), tool names, and tool call validation + - `src/constants/` — Model configs, agent IDs, OAuth settings, billing constants + - `src/util/` — Error handling (`ErrorOr`), message utilities, string helpers, XML parsing + - `src/templates/` — Agent definition types, initial `.agents/` directory template + - `src/testing/` — Mock factories for database, filesystem, analytics, fetch, timers +- **Depends on:** nothing (leaf package) + +### `agents/` — Agent Definitions + +Prompt-based and programmatic agent definitions that ship with Codebuff. + +- **Key agents:** + - `base2/` — The default agent (base2, base2-max, base2-free, base2-plan) + - `editor/` — Code editing specialist with best-of-N selection + - `file-explorer/` — File picker, code searcher, directory lister, glob matcher + - `thinker/` — Deep reasoning agent with best-of-N variants + - `reviewer/` — Code review agent with multi-prompt variant + - `researcher/` — Web search and docs search agents + - `general-agent/` — General-purpose agents (opus-agent, gpt-5-agent) + - `commander.ts` / `commander-lite.ts` — Terminal command execution agents + - `context-pruner.ts` — Conversation summarization to manage context length +- **Depends on:** `common` (for agent definition types and tool params) + +### `web/` — Next.js Web Application + +The Codebuff web server, marketing site, and API. 
+ +- **Key areas:** + - `src/app/api/v1/chat/completions/` — The main LLM proxy endpoint (routes to OpenRouter, Fireworks, OpenAI) + - `src/app/api/v1/` — REST API: agent runs, feedback, usage, web search, docs search, token count + - `src/app/api/auth/` — NextAuth.js authentication (GitHub OAuth) + - `src/app/api/stripe/` — Billing: credit purchases, subscriptions, webhooks + - `src/app/api/agents/` — Agent registry: publish, validate, fetch + - `src/app/api/orgs/` — Organization management: teams, billing, repos + - `src/app/` — Marketing pages, docs (MDX via contentlayer), user profile, pricing + - `src/llm-api/` — LLM provider integrations (OpenRouter, Fireworks, OpenAI, SiliconFlow, CanopyWave) +- **Depends on:** `common`, `internal`, `billing`, `bigquery` + +### `packages/internal/` — Internal Utilities + +Server-side utilities, database schema, and vendor forks shared between `web` and `sdk`. + +- **Key areas:** + - `src/db/` — Drizzle ORM schema (`schema.ts`), migrations, Docker Compose for local Postgres + - `src/env.ts` — Server environment variable validation (@t3-oss/env-nextjs) + - `src/loops/` — Loops email service integration (transactional emails) + - `src/openai-compatible/` — Forked OpenAI-compatible AI SDK provider (used by the SDK to call the Codebuff backend) + - `src/openrouter-ai-sdk/` — Forked OpenRouter AI SDK provider (used by the web server) + - `src/templates/` — Agent template fetching and validation +- **Depends on:** `common` + +### `packages/billing/` — Billing & Credits + +Credit management, subscription handling, and usage tracking. 
+ +- **Key components:** + - `balance-calculator.ts` — Credit balance calculation (free, purchased, rollover, subscription grants) + - `subscription.ts` — Subscription plan management, block grants, weekly limits + - `grant-credits.ts` — Credit grant operations (referral, purchase, admin, free) + - `auto-topup.ts` — Automatic credit purchases when balance is low + - `usage-service.ts` — Usage data aggregation + - `credit-delegation.ts` — Organization credit delegation +- **Depends on:** `common` (for DB access, Stripe utils, types) + +### `packages/bigquery/` — Analytics Data + +Google BigQuery integration for storing agent interaction traces and usage analytics. + +- **Tables:** `traces` (agent interactions), `relabels` (fine-tuning relabeling data) +- **Trace types:** file selection calls, file trees, agent responses, training data, model grading +- **Depends on:** `common` + +### `packages/code-map/` — Code Parsing + +Tree-sitter based source code parser that extracts function/variable names for file tree display. + +- **Supports:** TypeScript, JavaScript, Python, Go, Rust, Java, C, C++, C#, Ruby, PHP +- **Used by:** The `read_subtree` tool to show parsed variable names alongside the file tree +- **Depends on:** nothing (leaf package) + +### `packages/build-tools/` — Build Utilities + +Custom build executors, currently just the Infisical secrets integration. + +### `.agents/` — Local Agent Templates + +Project-specific agent definitions for this repository. These are loaded automatically by the agent runtime. + +- CLI agent templates (claude-code-cli, codex-cli, gemini-cli, codebuff-local-cli) +- Notion query agents +- Skills (cleanup, meta, review) + +### `evals/` — Evaluation Framework + +BuffBench evaluation suite for measuring agent performance on real-world coding tasks. 
+ +- **Workflow:** Pick commits → generate eval tasks → run agents → judge results → extract lessons +- **Runners:** Codebuff, Claude Code, Codex +- **Depends on:** `common`, `agent-runtime`, `sdk` + +### `freebuff/` — Free Tier Product + +A separate free-to-use version of Codebuff with its own CLI binary and web app. + +- `freebuff/cli/` — Standalone CLI binary and release scripts +- `freebuff/web/` — Minimal Next.js app for auth (login, onboarding) +- Uses ChatGPT OAuth for free LLM access (no Codebuff credits required) + +### `scripts/` — Development & Operations + +Developer tooling, analytics scripts, and service management. + +- `start-services.ts` / `stop-services.ts` / `status-services.ts` — Local dev environment management +- `tmux/` — tmux helper scripts for CLI E2E testing +- Analytics: DAU calculation, MRR, subscriber profitability, model usage +- Release: changelog generation, credit grants, worktree management + +## Key Architectural Patterns + +### Dependency Injection via Contracts + +The codebase avoids tight coupling between packages using contract types in `common/src/types/contracts/`: + +- `database.ts` — DB access functions (`GetUserInfoFromApiKeyFn`, `StartAgentRunFn`, etc.) +- `llm.ts` — LLM calling functions (`PromptAiSdkStreamFn`, `PromptAiSdkFn`) +- `analytics.ts` — Event tracking (`TrackEventFn`) +- `client.ts` — Client-server communication (`RequestToolCallFn`, `SendActionFn`) +- `env.ts` — Environment variable access (`BaseEnv`, `ClientEnv`, `CiEnv`) + +This allows the agent-runtime to be used by both the SDK (local execution) and the web server (if needed) without direct dependencies. + +### ErrorOr Pattern + +Prefer `ErrorOr` return values (`success(value)` / `failure(error)`) over throwing exceptions. Defined in `common/src/util/error.ts`. + +### Local Tool Execution + +Tool calls (file edits, terminal commands, code search) execute **on the user's machine** via the SDK, not on the server. 
The agent-runtime sends tool call requests through `requestToolCall`, which the SDK handles locally. + +### AI SDK Integration + +The project uses Vercel's [AI SDK](https://sdk.vercel.ai/) (`ai` package) for LLM interactions: + +- `streamText()` for streaming responses +- `generateText()` / `generateObject()` for non-streaming +- Custom `OpenAICompatibleChatLanguageModel` provider for the Codebuff backend +- `APICallError` for HTTP error handling (see [Error Schema](./error-schema.md)) + +### Agent Template System + +Agents are defined as templates with: + +- **Prompt agents** — System prompt + tool list + spawnable subagents +- **Programmatic agents** — `handleSteps` generator functions that run in a sandbox +- Templates live in `agents/` (shipped) and `.agents/` (project-local) +- Users can publish agents to the Codebuff registry + +## Development + +```bash +bun up # Start web server + database +bun start-cli # Start CLI (separate terminal) +bun ps # Check running services +bun down # Stop services +bun typecheck # Run all type checks +bun test # Run all tests +``` + +See the [Request Flow](./request-flow.md) doc for the detailed path a prompt takes through the system. diff --git a/docs/error-schema.md b/docs/error-schema.md new file mode 100644 index 0000000000..8cc9b088b5 --- /dev/null +++ b/docs/error-schema.md @@ -0,0 +1,213 @@ +# Error Schema: Server Responses & Client Handling + +This document describes the error responses the Codebuff server sends, how the AI SDK transforms them, and how errors are ultimately displayed in the CLI. + +## Server Error Responses + +**Source:** `web/src/app/api/v1/chat/completions/_post.ts` + +The server returns JSON error responses with an HTTP status code. 
There are two shapes: + +### Simple errors (message only) + +```json +{ "message": "<human-readable message>" } +``` + +Used for: + +| Status | Example message | +|--------|----------------| +| 400 | `"Invalid JSON in request body"` | +| 400 | `"No runId found in request body"` | +| 401 | `"Unauthorized"` | +| 401 | `"Invalid Codebuff API key"` | +| 402 | `"Out of credits. Please add credits at https://codebuff.com/usage. Your free credits reset in 3 hours."` | + +### Typed errors (error code + message) + +```json +{ "error": "<error_code>", "message": "<human-readable message>" } +``` + +Used for errors that the client needs to identify programmatically: + +| Status | `error` code | Example `message` | +|--------|-------------|-------------------| +| 403 | `account_suspended` | `"Your account has been suspended due to billing issues. Please contact support@codebuff.com to resolve this."` | +| 403 | `free_mode_unavailable` | `"Free mode is not available in your country."` | +| 429 | `rate_limit_exceeded` | `"Subscription weekly limit reached. Your limit resets in 2 hours. Enable 'Continue with credits' in the CLI to use a-la-carte credits."` | + +### Catch-all server error + +```json +{ "error": "Failed to process request" } +``` + +The 500 catch-all uses `error` as a human-readable string (no `message` field). This does not follow the typed error pattern above — it's a legacy format. + +### Provider errors + +When the upstream LLM provider (OpenRouter, Fireworks, OpenAI, etc.) returns an error, the server passes it through via the provider's `.toJSON()` format, which varies by provider. + +## The AI SDK Transformation Problem + +The Codebuff backend is called through the AI SDK's `OpenAICompatibleChatLanguageModel`, which treats it as a standard OpenAI-compatible endpoint. When the server returns a non-2xx response, **the AI SDK wraps it** into an `APICallError`: + +``` +Server returns: HTTP 403 { "error": "free_mode_unavailable", "message": "Free mode is not available in your country." 
} + │ + ▼ +AI SDK creates: APICallError { + message: "Forbidden" ← HTTP status text (NOT the server's message) + statusCode: 403 + responseBody: "{\"error\":\"free_mode_unavailable\",\"message\":\"Free mode is not available in your country.\"}" ← original JSON as a string + } +``` + +The server's human-readable `message` and machine-readable `error` code are buried inside `responseBody` as a JSON string. The `APICallError.message` is just the HTTP status text ("Forbidden", "Payment Required", etc.). + +## Client-Side Error Recovery + +To recover the server's structured error details, we use `parseApiErrorResponseBody()` from `common/src/util/error.ts`: + +```typescript +export function parseApiErrorResponseBody(responseBody: unknown): { + errorCode?: string + message?: string +} +``` + +This is called in two places: + +### 1. Agent Runtime catch block + +**File:** `packages/agent-runtime/src/run-agent-step.ts` (in `loopAgentSteps`) + +This is the **primary** error handler. Most API errors are caught here because the error occurs during `runAgentStep()` → `promptAiSdkStream()` → `streamText()`. + +```typescript +catch (error) { + if (error instanceof APICallError) { + const parsed = parseApiErrorResponseBody(error.responseBody) + // parsed.errorCode = 'free_mode_unavailable' + // parsed.message = 'Free mode is not available in your country.' + } + // ... + return { + output: { + type: 'error', + message: hasServerMessage ? errorMessage : 'Agent run error: ' + errorMessage, + statusCode, + error: errorCode, // ← machine-readable code for client matching + }, + } +} +``` + +### 2. SDK .catch() handler + +**File:** `sdk/src/run.ts` (in `callMainPrompt().catch()`) + +This is a **fallback** handler for errors that escape the agent runtime (e.g., errors during setup before the agent loop starts). + +## Error Output Schema + +**File:** `common/src/types/session-state.ts` + +The `AgentOutputSchema` defines the Zod schema for agent output. 
The error variant: + +```typescript +z.object({ + type: z.literal('error'), + message: z.string(), + statusCode: z.number().optional(), + error: z.string().optional(), // machine-readable error code +}) +``` + +All three fields flow through to the CLI. + +## CLI Error Handling + +**Files:** `cli/src/utils/error-handling.ts`, `cli/src/hooks/helpers/send-message.ts` + +The CLI checks the output for known error types: + +```typescript +// Checks statusCode === 402 +isOutOfCreditsError(output) → shows OUT_OF_CREDITS_MESSAGE + +// Checks statusCode === 403 && error === 'free_mode_unavailable' +isFreeModeUnavailableError(output) → shows FREE_MODE_UNAVAILABLE_MESSAGE +``` + +For all other errors, the raw `output.message` is displayed in the `UserErrorBanner`. + +## Error Flow Diagram + +``` + Server AI SDK Agent Runtime SDK CLI + │ │ │ │ │ + │ HTTP 403 │ │ │ │ + │ { error, message } │ │ │ │ + │────────────────────────▶│ │ │ │ + │ │ APICallError │ │ │ + │ │ .message="Forbidden" │ │ │ + │ │ .responseBody="{...}" │ │ │ + │ │────────────────────────▶│ │ │ + │ │ │ catch (APICallError) │ │ + │ │ │ parseResponseBody() │ │ + │ │ │ extract error code │ │ + │ │ │ extract message │ │ + │ │ │─────────────────────▶ │ │ + │ │ │ prompt-response │ │ + │ │ │ { type: 'error', │ │ + │ │ │ statusCode: 403, │ │ + │ │ │ error: '...', │ │ + │ │ │ message: '...' } │ │ + │ │ │ │─────────────────────▶│ + │ │ │ │ handleRunCompletion │ + │ │ │ │ isFreeModeUnavail.. │ + │ │ │ │ show friendly msg │ +``` + +## Adding a New Server Error Type + +To add a new error type that the CLI can identify and handle specially: + +1. **Server** (`web/src/app/api/v1/chat/completions/_post.ts`): Return a typed error: + ```typescript + return NextResponse.json( + { error: 'your_error_code', message: 'User-friendly message.' }, + { status: 4xx }, + ) + ``` + +2. 
**CLI error detection** (`cli/src/utils/error-handling.ts`): Add a checker: + ```typescript + export const isYourError = (error: unknown): boolean => { + if ( + error && + typeof error === 'object' && + 'statusCode' in error && + (error as { statusCode: unknown }).statusCode === 4xx && + 'error' in error && + (error as { error: unknown }).error === 'your_error_code' + ) { + return true + } + return false + } + ``` + +3. **CLI display** (`cli/src/hooks/helpers/send-message.ts`): Handle it in `handleRunCompletion`: + ```typescript + if (isYourError(output)) { + updater.setError(YOUR_ERROR_MESSAGE) + finalizeAfterError() + return + } + ``` + +No changes needed in the agent runtime or SDK — `parseApiErrorResponseBody` automatically extracts any `error` and `message` fields from the server's response body. diff --git a/docs/request-flow.md b/docs/request-flow.md new file mode 100644 index 0000000000..427611525f --- /dev/null +++ b/docs/request-flow.md @@ -0,0 +1,180 @@ +# Request Flow: CLI → Server → CLI + +This document traces the exact path a user prompt takes from the Codebuff CLI through the SDK, agent runtime, server, and back. + +## Overview + +``` +┌─────────┐ ┌─────────┐ ┌───────────────┐ ┌────────────────┐ ┌──────────┐ +│ CLI │───▶│ SDK │───▶│ Agent Runtime │───▶│ Codebuff Server│───▶│ LLM API │ +│ (TUI) │◀───│ run.ts │◀───│ loopAgentSteps│◀───│ /v1/chat/... │◀───│(OR/OAI/..)│ +└─────────┘ └─────────┘ └───────────────┘ └────────────────┘ └──────────┘ +``` + +## Step-by-Step Flow + +### 1. CLI: User Input + +**Files:** `cli/src/hooks/use-send-message.ts`, `cli/src/hooks/helpers/send-message.ts` + +1. User types a prompt and hits Enter. +2. `prepareUserMessage()` processes the input: + - Collects pending bash context (terminal output since last prompt) + - Processes image and text attachments + - Creates a user message in the chat UI +3. 
`setupStreamingContext()` initializes: + - An `AbortController` (for user cancellation via Escape) + - A timer (tracks elapsed time) + - A batched message updater (efficiently updates the UI) +4. The CLI calls `client.run()` from the SDK. + +### 2. SDK: Orchestration + +**File:** `sdk/src/run.ts` + +1. `run()` → `runOnce()` is called with the prompt, agent ID, cost mode, and session state. +2. **Session state** is initialized (fresh) or restored (from `previousRun`). +3. **User identity** is verified via `getUserInfoFromApiKey()` (calls the web API). +4. **Tool handlers** are registered — these execute locally on the user's machine: + - `write_file`, `str_replace`, `apply_patch` → file edits + - `run_terminal_command` → shell commands + - `code_search`, `glob`, `list_directory` → file search + - `read_files` → file reading + - Custom tool definitions and MCP tools +5. **Action handlers** are registered to process server responses: + - `response-chunk` → streams text to the CLI + - `subagent-response-chunk` → streams subagent output + - `prompt-response` → final result (resolves the promise) + - `prompt-error` → error result +6. `callMainPrompt()` is called (fire-and-forget, with a `.catch()` handler). +7. The function returns a promise that resolves when `prompt-response` or an error arrives. + +### 3. Agent Runtime: Main Prompt + +**File:** `packages/agent-runtime/src/main-prompt.ts` + +1. `callMainPrompt()` resets credits to 0 (server controls cost tracking). +2. Assembles **local agent templates** from the project's `.agents/` directory. +3. Sends a `response-chunk` `start` event to the CLI. +4. `mainPrompt()` determines the **agent type** based on cost mode: + - `free` → `base-free` + - `normal` → `base` + - `max` → `base-max` + - `ask` → `ask` + - `experimental` → `base2` + - Fallback (default) → `base2` + - Or a custom agent ID +5. Calls `loopAgentSteps()` with the agent template, prompt, and session state. + +### 4. 
Agent Runtime: Agent Loop + +**File:** `packages/agent-runtime/src/run-agent-step.ts` + +1. `loopAgentSteps()` starts an **agent run** (recorded in the database). +2. Builds the **system prompt**, **tool definitions**, and **initial messages**. +3. Enters the main loop: + ``` + while (true) { + // 1. Run programmatic step (if agent has handleSteps) + // 2. Check if turn should end + // 3. Call runAgentStep() for LLM inference + // 4. Process tool calls and responses + } + ``` +4. Each `runAgentStep()` call: + - Checks context token count via the `/api/v1/token-count` endpoint + - Calls `getAgentStreamFromTemplate()` → `promptAiSdkStream()` + - `processStream()` iterates over the AI SDK stream, handling text chunks and tool calls + - Tool calls are sent back to the SDK via `requestToolCall`, executed locally, and results fed back +5. The loop continues until the agent signals completion (no more tool calls, or `task_completed` tool). +6. Sends a `response-chunk` `finish` event, then a `prompt-response` action with the final session state and output. + +### 5. LLM Call: Model Provider Selection + +**Files:** `sdk/src/impl/llm.ts`, `sdk/src/impl/model-provider.ts` + +`promptAiSdkStream()` selects the model provider: + +1. **Claude OAuth** — If the user has connected their Claude subscription and the model is a Claude model, requests go directly to `api.anthropic.com` using the user's OAuth token. Zero cost to the user's Codebuff credits. +2. **ChatGPT OAuth** — If the user has connected their ChatGPT subscription and the model is an OpenAI model, requests go to the ChatGPT backend API. +3. **Codebuff Backend** (default) — Requests go to `POST /api/v1/chat/completions` on the Codebuff web server, which routes to the appropriate LLM provider. + +For OAuth providers, rate limit errors trigger automatic fallback to the Codebuff backend (unless in free mode). + +The AI SDK's `streamText()` function handles the actual HTTP call, streaming, and retry logic. + +### 6. 
Server: Chat Completions Endpoint + +**File:** `web/src/app/api/v1/chat/completions/_post.ts` + +The server processes the request through several validation gates: + +1. **Parse request body** — Returns 400 if invalid JSON. +2. **Authenticate** — Extracts API key from `Authorization` header. Returns 401 if missing/invalid. +3. **Check ban status** — Returns 403 `account_suspended` if user is banned. +4. **Free mode country check** — For free mode requests, checks user's IP against allowed countries. Returns 403 `free_mode_unavailable` if not allowed. +5. **Validate agent run** — Checks the `run_id` exists and is in `running` status. Returns 400 if invalid. +6. **Subscription block grant** — For subscribers, ensures a billing block is active. Returns 429 `rate_limit_exceeded` if limit hit and fallback disabled. +7. **Credit check** — Returns 402 if user has no remaining credits (and not a free mode request). +8. **Route to LLM provider** — Based on the model, routes to: + - Fireworks AI (for supported models) + - OpenAI direct (for OpenAI models) + - OpenRouter (default, for all other models) +9. **Return response** — Streaming requests return an SSE stream (`text/event-stream`). Non-streaming requests return JSON. + +### 7. Response Flow Back to CLI + +1. The LLM provider streams tokens back to the server. +2. The server forwards the SSE stream to the AI SDK client. +3. `promptAiSdkStream()` yields chunks from the AI SDK's `fullStream`: + - `text-delta` → text content + - `tool-call` → tool invocation + - `error` → error handling (OAuth fallback, retries, etc.) +4. `processStream()` in agent-runtime handles each chunk: + - Text chunks → `sendAction({ type: 'response-chunk', chunk })` → SDK → CLI UI + - Tool calls → `requestToolCall()` → SDK executes locally → result fed back to stream +5. 
When the agent loop finishes, `callMainPrompt` sends: + - A `response-chunk` `finish` event (with total cost) + - A `prompt-response` action (with final session state and output) +6. The SDK's `handlePromptResponse()` validates the output against `AgentOutputSchema` and resolves the promise. +7. The CLI's `handleRunCompletion()` processes the result: + - Checks for known error types (out of credits, free mode unavailable) + - Updates the UI with completion time and credit cost + - Marks the message as complete + +## Tool Call Lifecycle + +Tool calls execute **locally on the user's machine**, not on the server: + +``` +LLM Response (tool_call) Agent Runtime processes stream + │ │ + ▼ ▼ + processStream() ─── requestToolCall ──▶ SDK run.ts + │ │ + │ handleToolCall() + │ │ + │ Executes locally + │ (file edit, terminal, search) + │ │ + ◀─────── tool result ───────────────┘ + │ + Feeds result back into next LLM call +``` + +## Session State + +Session state persists across prompts within a conversation: + +- `sessionState.mainAgentState.messageHistory` — Full conversation history +- `sessionState.fileContext` — Project files, knowledge files, custom tools +- The CLI stores the `RunState` from each run and passes it as `previousRun` to the next `client.run()` call + +## Cancellation + +When the user presses Escape: + +1. CLI aborts the `AbortController` +2. The `abort` signal propagates through the SDK → agent runtime → AI SDK +3. `loopAgentSteps` catches the `AbortError`, marks the run as `cancelled` +4. 
CLI's abort handler shows an interruption notice and marks the message complete diff --git a/freebuff/cli/release/package.json b/freebuff/cli/release/package.json index c893ed5cab..f5302ff59c 100644 --- a/freebuff/cli/release/package.json +++ b/freebuff/cli/release/package.json @@ -1,6 +1,6 @@ { "name": "freebuff", - "version": "0.0.14", + "version": "0.0.15", "description": "The world's strongest free coding agent", "license": "MIT", "bin": { diff --git a/freebuff/e2e/README.md b/freebuff/e2e/README.md new file mode 100644 index 0000000000..861d31f5be --- /dev/null +++ b/freebuff/e2e/README.md @@ -0,0 +1,169 @@ +# Freebuff E2E Tests + +End-to-end tests for the Freebuff CLI binary. Tests verify that the compiled binary works correctly by interacting with it via tmux. + +## Architecture + +Two testing approaches are supported: + +### 1. Direct tmux tests (fast, deterministic) + +Use the `FreebuffSession` class to start the binary in tmux, send commands, capture output, and assert directly. + +```typescript +import { describe, test, expect, afterEach } from 'bun:test' +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +describe('My Feature', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) await session.stop() + session = null + }) + + test('works correctly', async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary) + + await session.send('/help') + const output = await session.capture(2) + + expect(output).toContain('Shortcuts') + }, 60_000) +}) +``` + +### 2. SDK agent-driven tests (AI-powered verification) + +Use the Codebuff SDK to run a testing agent that interacts with Freebuff via custom tmux tools. The agent reasons about the CLI output and verifies complex behaviors. 
+ +```typescript +import { describe, test, expect, afterEach } from 'bun:test' +import { CodebuffClient } from '@codebuff/sdk' +import { freebuffTesterAgent } from '../agent/freebuff-tester' +import { createFreebuffTmuxTools, requireFreebuffBinary } from '../utils' + +describe('Agent Test', () => { + let cleanup: (() => Promise<void>) | null = null + + afterEach(async () => { + if (cleanup) await cleanup() + cleanup = null + }) + + test('verifies startup', async () => { + const apiKey = process.env.CODEBUFF_API_KEY + if (!apiKey) return // Skip if no API key + + const binary = requireFreebuffBinary() + const tmuxTools = createFreebuffTmuxTools(binary) + cleanup = tmuxTools.cleanup + + const client = new CodebuffClient({ apiKey }) + const result = await client.run({ + agent: freebuffTesterAgent.id, + prompt: 'Start Freebuff and verify the branding is correct.', + agentDefinitions: [freebuffTesterAgent], + customToolDefinitions: tmuxTools.tools, + handleEvent: () => {}, + }) + + expect(result.output.type).not.toBe('error') + }, 180_000) +}) +``` + +## Prerequisites + +- **tmux** must be installed: `brew install tmux` (macOS) or `sudo apt-get install tmux` (Ubuntu) +- **Freebuff binary** must be built: `bun freebuff/cli/build.ts 0.0.0-dev` +- **SDK built** (for agent tests): `cd sdk && bun run build` +- **CODEBUFF_API_KEY** (for agent tests only): Set this environment variable + +## Running Tests + +### Build the binary first + +```bash +bun freebuff/cli/build.ts 0.0.0-dev +``` + +### Run all tests + +```bash +bun test freebuff/e2e/tests/ +``` + +### Run a specific test + +```bash +bun test freebuff/e2e/tests/version.e2e.test.ts +bun test freebuff/e2e/tests/startup.e2e.test.ts +bun test freebuff/e2e/tests/help-command.e2e.test.ts +bun test freebuff/e2e/tests/agent-startup.e2e.test.ts +``` + +### Use a custom binary path + +```bash +FREEBUFF_BINARY=/path/to/freebuff bun test freebuff/e2e/tests/ +``` + +## Adding New Tests + +1. 
Create a new file in `freebuff/e2e/tests/` with the naming convention `<name>.e2e.test.ts` +2. Add the test name to `.github/workflows/freebuff-e2e.yml` matrix: + +```yaml +matrix: + test: + - version + - startup + - help-command + - agent-startup + - your-new-test # <-- add here +``` + +3. The test will automatically run in parallel with other tests in CI. + +## CI Workflow + +The `.github/workflows/freebuff-e2e.yml` workflow: + +1. **Builds** the Freebuff binary once (linux-x64) +2. **Runs each test file in parallel** via GitHub Actions matrix strategy +3. **Uploads tmux session logs** on failure for debugging + +Triggers: +- **Push / pull request** to `main` +- **Manual** via workflow_dispatch + +## Utilities Reference + +### `FreebuffSession` + +| Method | Description | +|--------|-------------| +| `FreebuffSession.start(binaryPath)` | Start binary in tmux, returns session | +| `session.send(text)` | Send text input (presses Enter) | +| `session.sendKey(key)` | Send special key (e.g. `'C-c'`, `'Escape'`) | +| `session.capture(waitSec?)` | Capture terminal output | +| `session.captureLabeled(label, waitSec?)` | Capture and save to session logs | +| `session.waitForText(pattern, timeoutMs?)` | Poll until text appears | +| `session.stop()` | Stop session and clean up | + +### `createFreebuffTmuxTools(binaryPath)` + +Creates SDK custom tools for agent-driven testing: +- `start_freebuff` - Launch the CLI +- `send_to_freebuff` - Send text input +- `capture_freebuff_output` - Capture terminal output +- `stop_freebuff` - Stop and clean up + +### Helper functions + +| Function | Description | +|----------|-------------| +| `requireFreebuffBinary()` | Get binary path, throws if not found | +| `getFreebuffBinaryPath()` | Get binary path (may not exist) | diff --git a/freebuff/e2e/agent/freebuff-tester.ts b/freebuff/e2e/agent/freebuff-tester.ts new file mode 100644 index 0000000000..a58d6dfb49 --- /dev/null +++ b/freebuff/e2e/agent/freebuff-tester.ts @@ -0,0 +1,52 @@ +import type { 
AgentDefinition } from '@codebuff/sdk' + +/** + * Agent definition for testing the Freebuff CLI via tmux. + * + * This agent is designed to be used with the custom tmux tools from + * `createFreebuffTmuxTools()`. It receives a testing task in its prompt + * and uses tmux tools to start Freebuff, interact with it, and verify behavior. + * + * Example usage: + * ```ts + * const { tools, cleanup } = createFreebuffTmuxTools(binaryPath) + * const result = await client.run({ + * agent: freebuffTesterAgent.id, + * prompt: 'Start freebuff and verify the welcome screen shows Freebuff branding', + * agentDefinitions: [freebuffTesterAgent], + * customToolDefinitions: tools, + * handleEvent: collector.handleEvent, + * }) + * await cleanup() + * ``` + */ +export const freebuffTesterAgent: AgentDefinition = { + id: 'freebuff-tester', + displayName: 'Freebuff E2E Tester', + model: 'anthropic/claude-sonnet-4.5', + toolNames: [ + 'start_freebuff', + 'send_to_freebuff', + 'capture_freebuff_output', + 'stop_freebuff', + ], + instructionsPrompt: `You are a QA tester for the Freebuff CLI application. + +Your job is to verify that Freebuff behaves correctly by interacting with it +through tmux tools. Follow these steps: + +1. Call start_freebuff to launch the CLI +2. Use capture_freebuff_output (with waitSeconds) to see the terminal output +3. Use send_to_freebuff to type commands or text +4. Capture output again to verify behavior +5. ALWAYS call stop_freebuff when done + +Key things to verify: +- The CLI starts without errors or crashes +- Branding shows "Freebuff" (not "Codebuff") +- Commands work as expected +- Error messages are user-friendly + +Report your findings clearly. 
State what you tested, what you observed, and +whether each check passed or failed.`, +} diff --git a/freebuff/e2e/tests/ads-behavior.e2e.test.ts b/freebuff/e2e/tests/ads-behavior.e2e.test.ts new file mode 100644 index 0000000000..1ba9fe4d4e --- /dev/null +++ b/freebuff/e2e/tests/ads-behavior.e2e.test.ts @@ -0,0 +1,79 @@ +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 60_000 + +describe('Freebuff: Ads Behavior', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'ads:enable command is not available', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Type "/ads" to check for ads commands in autocomplete + await session.send('/ads', { noEnter: true }) + const output = await session.capture(2) + + // Neither ads:enable nor ads:disable should appear + expect(output).not.toContain('ads:enable') + expect(output).not.toContain('ads:disable') + }, + TEST_TIMEOUT, + ) + + test( + 'ads:disable command is not available', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Try to send the /ads:disable command + await session.send('/ads:disable') + const output = await session.capture(3) + + // The command should not be recognized + // It should NOT show "Ads disabled" confirmation + expect(output).not.toMatch(/ads disabled/i) + }, + TEST_TIMEOUT, + ) + + test( + 'does not show credits earned from ads', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + const output = await session.capture() + + // In Freebuff, ads don't show "+X credits" because credits don't apply + // Check the startup screen doesn't mention ad credits + 
expect(output).not.toMatch(/\+\d+ credits/) + }, + TEST_TIMEOUT, + ) + + test( + 'does not show "Hide ads" option', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + const output = await session.capture() + + // In Freebuff, the "Hide ads" link is not shown because ads are mandatory + expect(output).not.toContain('Hide ads') + // Also should not mention /ads:enable as a way to re-enable + expect(output).not.toContain('/ads:enable') + }, + TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/agent-startup.e2e.test.ts b/freebuff/e2e/tests/agent-startup.e2e.test.ts new file mode 100644 index 0000000000..6d436758a8 --- /dev/null +++ b/freebuff/e2e/tests/agent-startup.e2e.test.ts @@ -0,0 +1,122 @@ +/** + * Agent-driven E2E test for Freebuff. + * + * Uses the Codebuff SDK to run a testing agent that interacts with the + * Freebuff CLI binary via tmux custom tools. Requires CODEBUFF_API_KEY. + * + * Set CODEBUFF_API_KEY to run this test, otherwise it will be skipped. + */ + +import { afterEach, describe, expect, test } from 'bun:test' + +import { freebuffTesterAgent } from '../agent/freebuff-tester' +import { createFreebuffTmuxTools, requireFreebuffBinary } from '../utils' + +import type { CodebuffClient as CodebuffClientType } from '@codebuff/sdk' + +const AGENT_TEST_TIMEOUT = 180_000 + +function getApiKey(): string | null { + return process.env.CODEBUFF_API_KEY ?? null +} + +describe('Freebuff: Agent-driven E2E', () => { + let cleanup: (() => Promise<void>) | null = null + + afterEach(async () => { + if (cleanup) { + await cleanup() + cleanup = null + } + }) + + test( + 'agent can start freebuff and verify startup behavior', + async () => { + const apiKey = getApiKey() + if (!apiKey) { + console.log( + 'Skipping agent test: CODEBUFF_API_KEY not set. 
' + + 'Set it to run agent-driven e2e tests.', + ) + return + } + + const binary = requireFreebuffBinary() + const tmuxTools = createFreebuffTmuxTools(binary) + cleanup = tmuxTools.cleanup + + // Dynamically import SDK to avoid build-time dependency issues + const { CodebuffClient } = (await import( + '@codebuff/sdk' + )) as typeof import('@codebuff/sdk') + + const client: CodebuffClientType = new CodebuffClient({ apiKey }) + + const events: Array<{ type: string; [key: string]: unknown }> = [] + + const result = await client.run({ + agent: freebuffTesterAgent.id, + prompt: + 'Start Freebuff using the start_freebuff tool. Then capture the output ' + + 'with capture_freebuff_output (waitSeconds: 3). Verify that:\n' + + '1. The CLI started without errors\n' + + '2. The output contains "freebuff" (case-insensitive)\n' + + '3. The output does NOT contain "codebuff" (case-insensitive)\n' + + 'Finally, call stop_freebuff to clean up. Report your findings.', + agentDefinitions: [freebuffTesterAgent], + customToolDefinitions: tmuxTools.tools, + handleEvent: (event) => { + events.push(event) + }, + }) + + expect(result.output.type).not.toBe('error') + + // Verify the agent used the tmux tools + const toolCalls = events.filter((e) => e.type === 'tool_call') + const toolNames = toolCalls.map((e) => e.toolName) + expect(toolNames).toContain('start_freebuff') + expect(toolNames).toContain('capture_freebuff_output') + expect(toolNames).toContain('stop_freebuff') + }, + AGENT_TEST_TIMEOUT, + ) + + test( + 'agent can send commands and verify output', + async () => { + const apiKey = getApiKey() + if (!apiKey) { + console.log('Skipping agent test: CODEBUFF_API_KEY not set.') + return + } + + const binary = requireFreebuffBinary() + const tmuxTools = createFreebuffTmuxTools(binary) + cleanup = tmuxTools.cleanup + + const { CodebuffClient } = (await import( + '@codebuff/sdk' + )) as typeof import('@codebuff/sdk') + + const client: CodebuffClientType = new CodebuffClient({ apiKey }) + 
+ const result = await client.run({ + agent: freebuffTesterAgent.id, + prompt: + 'Start Freebuff, wait for it to load (capture with waitSeconds: 5), ' + + 'then send the "/help" command using send_to_freebuff. ' + + 'Capture the output after 2 seconds. ' + + 'Verify the help content is displayed. ' + + 'Stop Freebuff when done and report your findings.', + agentDefinitions: [freebuffTesterAgent], + customToolDefinitions: tmuxTools.tools, + handleEvent: () => {}, + }) + + expect(result.output.type).not.toBe('error') + }, + AGENT_TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/code-edit.e2e.test.ts b/freebuff/e2e/tests/code-edit.e2e.test.ts new file mode 100644 index 0000000000..957ccac7f9 --- /dev/null +++ b/freebuff/e2e/tests/code-edit.e2e.test.ts @@ -0,0 +1,75 @@ +/** + * E2E test that verifies Freebuff can perform a simple code edit. + * + * Starts Freebuff in tmux, sends a prompt asking it to add a console.log + * to a file, and verifies the file was modified correctly. + * + * Requires CODEBUFF_API_KEY — skipped if not set. + */ + +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 180_000 + +function getApiKey(): string | null { + return process.env.CODEBUFF_API_KEY ?? null +} + +describe('Freebuff: Code Edit', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'adds a console.log to a file', + async () => { + if (!getApiKey()) { + console.log( + 'Skipping code-edit test: CODEBUFF_API_KEY not set. 
' + + 'Set it to run code-edit e2e tests.', + ) + return + } + + const binary = requireFreebuffBinary() + const initialContent = [ + 'function greet(name) {', + " return 'Hello, ' + name", + '}', + '', + ].join('\n') + + // Create the file before starting freebuff so it's in the initial context + session = await FreebuffSession.start(binary, { + waitSeconds: 5, + initialFiles: { 'index.js': initialContent }, + }) + + // Verify the file was created + expect(session.readFile('index.js')).toBe(initialContent) + + // Send a prompt asking freebuff to add a console.log + await session.send("Add a console.log('hello world') to index.js") + + // Wait for the file to be modified with the console.log + const finalContent = await session.waitForFileContent( + 'index.js', + 'console.log', + 120_000, + ) + + expect(finalContent).toContain('console.log') + expect(finalContent).toContain('hello world') + // The original function should still be present + expect(finalContent).toContain('function greet') + }, + TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/help-command.e2e.test.ts b/freebuff/e2e/tests/help-command.e2e.test.ts new file mode 100644 index 0000000000..173a3425b8 --- /dev/null +++ b/freebuff/e2e/tests/help-command.e2e.test.ts @@ -0,0 +1,77 @@ +import { execSync } from 'child_process' + +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 60_000 + +describe('Freebuff: --help flag', () => { + test('shows CLI usage information', () => { + const binary = requireFreebuffBinary() + const output = execSync(`'${binary}' --help`, { + encoding: 'utf-8', + timeout: 10_000, + }) + + // Should show the binary name + expect(output.toLowerCase()).toContain('freebuff') + + // Should show usage info + expect(output).toMatch(/usage|options|commands/i) + }) + + test('does not reference Codebuff', () => { + const binary = requireFreebuffBinary() + const output = 
execSync(`'${binary}' --help`, { + encoding: 'utf-8', + timeout: 10_000, + }) + + // The --help output should say Freebuff, not Codebuff + expect(output).not.toMatch(/\bcodebuff\b/i) + }) +}) + +describe('Freebuff: /help slash command', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'shows help content when /help is entered', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + await session.send('/help') + const output = await session.capture(2) + + // Should show shortcuts section + expect(output).toMatch(/shortcut|ctrl|esc/i) + }, + TEST_TIMEOUT, + ) + + test( + 'does not show subscription commands in help', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + await session.send('/help') + const output = await session.capture(2) + + // Freebuff should NOT show these paid/subscription commands + expect(output).not.toContain('/subscribe') + expect(output).not.toContain('/usage') + expect(output).not.toContain('/credits') + }, + TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/slash-commands.e2e.test.ts b/freebuff/e2e/tests/slash-commands.e2e.test.ts new file mode 100644 index 0000000000..8631a3d4e6 --- /dev/null +++ b/freebuff/e2e/tests/slash-commands.e2e.test.ts @@ -0,0 +1,107 @@ +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 60_000 + +/** + * Commands that should be REMOVED in Freebuff. + * These are stripped at build time via the FREEBUFF_REMOVED_COMMAND_IDS set + * in cli/src/data/slash-commands.ts. 
+ */ +const REMOVED_COMMANDS = [ + '/subscribe', + '/usage', + '/credits', + '/ads:enable', + '/ads:disable', + '/connect:claude', + '/refer-friends', + '/agent:gpt-5', + '/image', + '/publish', + '/init', +] + +/** + * Commands that should be KEPT in Freebuff. + * Only includes commands reliably visible in the initial autocomplete viewport. + * Commands like /logout and /exit exist but may be scrolled off-screen. + */ +const KEPT_COMMANDS = [ + '/help', + '/new', + '/history', + '/feedback', + '/bash', + '/theme:toggle', +] + +describe('Freebuff: Slash Commands', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'slash command menu does not show removed commands', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Type "/" to trigger the slash command autocomplete menu + await session.send('/', { noEnter: true }) + const output = await session.capture(2) + + // Removed commands should NOT appear in the autocomplete menu + for (const cmd of REMOVED_COMMANDS) { + // Strip the leading slash for matching since the menu shows command ids + const cmdId = cmd.slice(1) + expect(output).not.toContain(cmdId) + } + }, + TEST_TIMEOUT, + ) + + test( + 'slash command menu shows kept commands', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Type "/" to trigger the slash command autocomplete menu + await session.send('/', { noEnter: true }) + const output = await session.capture(2) + + // Kept commands SHOULD appear in the autocomplete menu + for (const cmd of KEPT_COMMANDS) { + const cmdId = cmd.slice(1) + expect(output).toContain(cmdId) + } + }, + TEST_TIMEOUT, + ) + + test( + 'no mode-related slash commands are visible', + async () => { + const binary = requireFreebuffBinary() + session = await 
FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Type "/mode" to check for mode commands + await session.send('/mode', { noEnter: true }) + const output = await session.capture(2) + + // Mode commands should not exist in Freebuff + expect(output).not.toContain('mode:max') + expect(output).not.toContain('mode:default') + expect(output).not.toContain('mode:lite') + expect(output).not.toContain('mode:free') + }, + TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/startup.e2e.test.ts b/freebuff/e2e/tests/startup.e2e.test.ts new file mode 100644 index 0000000000..173520bfaa --- /dev/null +++ b/freebuff/e2e/tests/startup.e2e.test.ts @@ -0,0 +1,71 @@ +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const STARTUP_TIMEOUT = 60_000 + +describe('Freebuff: Startup', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'binary starts without crashing', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary) + const output = await session.capture(3) + + // Should not contain fatal errors + expect(output).not.toContain('FATAL') + expect(output).not.toContain('panic') + expect(output).not.toContain('Segmentation fault') + + // Should have some visible output (not a blank screen) + const nonEmptyLines = output + .split('\n') + .filter((line) => line.trim().length > 0) + expect(nonEmptyLines.length).toBeGreaterThan(0) + }, + STARTUP_TIMEOUT, + ) + + test( + 'shows Freebuff branding', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary) + const output = await session.capture(3) + + // The CLI should identify itself as Freebuff, not Codebuff + const lowerOutput = output.toLowerCase() + expect(lowerOutput).toContain('freebuff') + }, + STARTUP_TIMEOUT, + ) + + test( + 'responds to Ctrl+C 
gracefully', + async () => { + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary) + + // Wait for startup, then send Ctrl+C + await session.capture(2) + await session.sendKey('C-c') + + // Give it a moment to process + const output = await session.capture(1) + + // Should not show an unhandled error + expect(output).not.toContain('Unhandled') + expect(output).not.toContain('FATAL') + }, + STARTUP_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/terminal-command.e2e.test.ts b/freebuff/e2e/tests/terminal-command.e2e.test.ts new file mode 100644 index 0000000000..9c3486d1ed --- /dev/null +++ b/freebuff/e2e/tests/terminal-command.e2e.test.ts @@ -0,0 +1,68 @@ +/** + * E2E test that verifies Freebuff can run terminal commands. + * + * Starts Freebuff in tmux, sends a prompt asking it to run a shell command, + * and verifies the command was executed by checking its side effects. + * + * Requires CODEBUFF_API_KEY — skipped if not set. + */ + +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 180_000 + +function getApiKey(): string | null { + return process.env.CODEBUFF_API_KEY ?? null +} + +describe('Freebuff: Terminal Command', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'runs a terminal command that creates a file', + async () => { + if (!getApiKey()) { + console.log( + 'Skipping terminal-command test: CODEBUFF_API_KEY not set. 
' + + 'Set it to run terminal-command e2e tests.', + ) + return + } + + const binary = requireFreebuffBinary() + session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + + // Ask freebuff to run a shell command whose output can only come from + // actual terminal execution (not file-writing tools) + await session.send( + 'Use the terminal to run: date +%s > timestamp.txt && echo done', + ) + + // Wait for date's newline-terminated output, not just the empty file the redirect creates + const content = await session.waitForFileContent( + 'timestamp.txt', + '\n', + 120_000, + ) + + // The file should contain a Unix timestamp (numeric string) + const trimmed = content.trim() + expect(trimmed).toMatch(/^\d{10,}$/) + + // Verify the timestamp is recent (within the last 5 minutes) + const timestamp = parseInt(trimmed, 10) + const now = Math.floor(Date.now() / 1000) + expect(Math.abs(now - timestamp)).toBeLessThan(300) + }, + TEST_TIMEOUT, + ) +}) diff --git a/freebuff/e2e/tests/version.e2e.test.ts b/freebuff/e2e/tests/version.e2e.test.ts new file mode 100644 index 0000000000..d204bd684e --- /dev/null +++ b/freebuff/e2e/tests/version.e2e.test.ts @@ -0,0 +1,24 @@ +import { execSync } from 'child_process' + +import { describe, expect, test } from 'bun:test' + +import { requireFreebuffBinary } from '../utils' + +describe('Freebuff: --version', () => { + test('outputs a version string', () => { + const binary = requireFreebuffBinary() + const output = execSync(`'${binary}' --version`, { + encoding: 'utf-8', + timeout: 10_000, + }).trim() + + // Should contain a semver-like version (e.g. 
"0.0.15" or "1.0.0") + expect(output).toMatch(/\d+\.\d+\.\d+/) + }) + + test('exits with code 0', () => { + const binary = requireFreebuffBinary() + // execSync throws on non-zero exit codes, so if this doesn't throw, it exited 0 + execSync(`'${binary}' --version`, { encoding: 'utf-8', timeout: 10_000 }) + }) +}) diff --git a/freebuff/e2e/utils/binary-helpers.ts b/freebuff/e2e/utils/binary-helpers.ts new file mode 100644 index 0000000000..c233574dd4 --- /dev/null +++ b/freebuff/e2e/utils/binary-helpers.ts @@ -0,0 +1,24 @@ +import { existsSync } from 'fs' +import { dirname, resolve } from 'path' +import { fileURLToPath } from 'url' + +const __dirname = dirname(fileURLToPath(import.meta.url)) +export const REPO_ROOT = resolve(__dirname, '../../..') + +export function getFreebuffBinaryPath(): string { + if (process.env.FREEBUFF_BINARY) { + return resolve(process.env.FREEBUFF_BINARY) + } + return resolve(REPO_ROOT, 'cli/bin/freebuff') +} + +export function requireFreebuffBinary(): string { + const binaryPath = getFreebuffBinaryPath() + if (!existsSync(binaryPath)) { + throw new Error( + `Freebuff binary not found at ${binaryPath}. ` + + 'Build with: bun freebuff/cli/build.ts <version>', + ) + } + return binaryPath +} diff --git a/freebuff/e2e/utils/freebuff-session.ts b/freebuff/e2e/utils/freebuff-session.ts new file mode 100644 index 0000000000..5521534434 --- /dev/null +++ b/freebuff/e2e/utils/freebuff-session.ts @@ -0,0 +1,162 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { tmuxCapture, tmuxSend, tmuxSendKey, tmuxStart, tmuxStop } from './tmux-helpers' + +export class FreebuffSession { + public readonly name: string + public readonly workDir: string + + private constructor(sessionName: string, workDir: string) { + this.name = sessionName + this.workDir = workDir + } + + /** + * Start a freebuff binary in a tmux session. + * Creates a temporary working directory to simulate a real user project. 
+ */ + static async start( + binaryPath: string, + options?: { + waitSeconds?: number + width?: number + height?: number + initialFiles?: Record<string, string> + }, + ): Promise<FreebuffSession> { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'freebuff-e2e-')) + + // Create a minimal project so freebuff has something to work with + fs.writeFileSync( + path.join(tmpDir, 'README.md'), + '# E2E Test Project\n', + 'utf-8', + ) + + // Write any initial files before starting the binary + if (options?.initialFiles) { + for (const [relativePath, content] of Object.entries(options.initialFiles)) { + const filePath = path.join(tmpDir, relativePath) + const dir = path.dirname(filePath) + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } + fs.writeFileSync(filePath, content, 'utf-8') + } + } + + const command = `cd '${tmpDir}' && '${binaryPath}'` + const sessionName = tmuxStart({ + command, + waitSeconds: options?.waitSeconds ?? 4, + width: options?.width ?? 120, + height: options?.height ?? 30, + }) + + return new FreebuffSession(sessionName, tmpDir) + } + + /** Write a file into the session's working directory. */ + writeFile(relativePath: string, content: string): void { + const filePath = path.join(this.workDir, relativePath) + const dir = path.dirname(filePath) + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } + fs.writeFileSync(filePath, content, 'utf-8') + } + + /** Read a file from the session's working directory. */ + readFile(relativePath: string): string { + return fs.readFileSync(path.join(this.workDir, relativePath), 'utf-8') + } + + /** Check if a file exists in the session's working directory. */ + fileExists(relativePath: string): boolean { + return fs.existsSync(path.join(this.workDir, relativePath)) + } + + /** + * Poll until a file in the working directory contains the given text. + * Throws if the timeout is exceeded. 
+ */ + async waitForFileContent( + relativePath: string, + pattern: string, + timeoutMs = 60_000, + ): Promise<string> { + const start = Date.now() + while (Date.now() - start < timeoutMs) { + try { + const content = this.readFile(relativePath) + if (content.includes(pattern)) return content + } catch { + // File may not exist yet + } + await new Promise((resolve) => setTimeout(resolve, 1_000)) + } + let finalContent = '(file does not exist)' + try { + finalContent = this.readFile(relativePath) + } catch { + // ignore + } + throw new Error( + `Timed out after ${timeoutMs}ms waiting for "${pattern}" in ${relativePath}.\n` + + `Last content:\n${finalContent}`, + ) + } + + /** Send text input to the freebuff CLI (presses Enter by default). */ + async send( + text: string, + options?: { noEnter?: boolean; waitIdle?: number }, + ): Promise<void> { + tmuxSend(this.name, text, { ...options, force: true }) + } + + /** Send a special key (e.g. Escape, C-c, Enter). */ + async sendKey(key: string): Promise<void> { + tmuxSendKey(this.name, key) + } + + /** Capture current terminal output, optionally waiting first. */ + async capture(waitSeconds?: number): Promise<string> { + return tmuxCapture(this.name, { waitSeconds, noSave: true }) + } + + /** Capture and auto-save to the session logs directory with a label. */ + async captureLabeled(label: string, waitSeconds?: number): Promise<string> { + return tmuxCapture(this.name, { waitSeconds, label }) + } + + /** + * Poll until the terminal output contains the given text. + * Throws if the timeout is exceeded. 
+ */ + async waitForText(pattern: string, timeoutMs = 30_000): Promise<string> { + const start = Date.now() + while (Date.now() - start < timeoutMs) { + const output = await this.capture() + if (output.includes(pattern)) return output + await new Promise((resolve) => setTimeout(resolve, 500)) + } + const finalOutput = await this.capture() + throw new Error( + `Timed out after ${timeoutMs}ms waiting for "${pattern}".\n` + + `Last output:\n${finalOutput}`, + ) + } + + /** Stop the tmux session and clean up the temp directory. */ + async stop(): Promise<void> { + tmuxStop(this.name) + try { + fs.rmSync(this.workDir, { recursive: true, force: true }) + } catch { + // Ignore cleanup errors + } + } +} diff --git a/freebuff/e2e/utils/index.ts b/freebuff/e2e/utils/index.ts new file mode 100644 index 0000000000..6927a4abd4 --- /dev/null +++ b/freebuff/e2e/utils/index.ts @@ -0,0 +1,10 @@ +export { getFreebuffBinaryPath, requireFreebuffBinary, REPO_ROOT } from './binary-helpers' +export { FreebuffSession } from './freebuff-session' +export { createFreebuffTmuxTools } from './tmux-custom-tools' +export { + tmuxStart, + tmuxSend, + tmuxSendKey, + tmuxCapture, + tmuxStop, +} from './tmux-helpers' diff --git a/freebuff/e2e/utils/tmux-custom-tools.ts b/freebuff/e2e/utils/tmux-custom-tools.ts new file mode 100644 index 0000000000..92af618934 --- /dev/null +++ b/freebuff/e2e/utils/tmux-custom-tools.ts @@ -0,0 +1,155 @@ +import { z } from 'zod/v4' + +import { FreebuffSession } from './freebuff-session' + +import type { ZodType } from 'zod/v4' + +interface FreebuffToolDefinition { + toolName: string + description: string + inputSchema: ZodType + endsAgentStep: boolean + exampleInputs: Record<string, unknown>[] + execute: (input: Record<string, unknown>) => Promise<ToolOutput> +} + +type ToolOutput = { type: 'json'; value: Record<string, unknown> }[] + +/** + * Creates custom tool definitions that allow a Codebuff SDK agent + * to interact with a Freebuff CLI binary via tmux. 
+ * + * Returns the tools array and a cleanup function to call in afterEach. 
+ * + * Usage: + * ```ts + * const { tools, cleanup } = createFreebuffTmuxTools(binaryPath) + * // ... pass tools to client.run({ customToolDefinitions: tools }) + * // ... in afterEach: await cleanup() + * ``` + */ +export function createFreebuffTmuxTools(binaryPath: string): { + tools: FreebuffToolDefinition[] + cleanup: () => Promise<void> +} { + let session: FreebuffSession | null = null + + const startTool: FreebuffToolDefinition = { + toolName: 'start_freebuff', + description: + 'Start the Freebuff CLI binary in a tmux terminal session. Call this first before interacting with Freebuff.', + inputSchema: z.object({}), + endsAgentStep: true, + exampleInputs: [{}], + execute: async (): Promise<ToolOutput> => { + if (session) { + return [ + { + type: 'json', + value: { + error: 'Session already running', + sessionName: session.name, + }, + }, + ] + } + session = await FreebuffSession.start(binaryPath) + const initialOutput = await session.capture(2) + return [ + { + type: 'json', + value: { + started: true, + sessionName: session.name, + initialOutput, + }, + }, + ] + }, + } + + const sendInputTool: FreebuffToolDefinition = { + toolName: 'send_to_freebuff', + description: + 'Send text input to the running Freebuff CLI. The text is sent as if typed by the user and Enter is pressed.', + inputSchema: z.object({ + text: z.string().describe('Text to send to Freebuff'), + }), + endsAgentStep: false, + exampleInputs: [{ text: '/help' }], + execute: async (input): Promise<ToolOutput> => { + const text = (input as { text: string }).text + if (!session) { + return [ + { + type: 'json', + value: { error: 'No session running. Call start_freebuff first.' }, + }, + ] + } + await session.send(text) + return [{ type: 'json', value: { sent: true, text } }] + }, + } + + const captureOutputTool: FreebuffToolDefinition = { + toolName: 'capture_freebuff_output', + description: + 'Capture the current terminal output from the running Freebuff CLI session. 
' + + 'Use waitSeconds to wait before capturing (useful after sending a command).', + inputSchema: z.object({ + waitSeconds: z + .number() + .optional() + .describe('Seconds to wait before capturing (default: 0)'), + }), + endsAgentStep: true, + exampleInputs: [{ waitSeconds: 2 }], + execute: async (input): Promise<ToolOutput> => { + const waitSeconds = (input as { waitSeconds?: number }).waitSeconds + if (!session) { + return [ + { + type: 'json', + value: { error: 'No session running. Call start_freebuff first.' }, + }, + ] + } + const output = await session.capture(waitSeconds) + return [{ type: 'json', value: { output } }] + }, + } + + const stopTool: FreebuffToolDefinition = { + toolName: 'stop_freebuff', + description: + 'Stop the running Freebuff CLI session and clean up resources. Always call this when done testing.', + inputSchema: z.object({}), + endsAgentStep: true, + exampleInputs: [{}], + execute: async (): Promise<ToolOutput> => { + if (!session) { + return [ + { type: 'json', value: { stopped: true, wasRunning: false } }, + ] + } + await session.stop() + session = null + return [ + { type: 'json', value: { stopped: true, wasRunning: true } }, + ] + }, + } + + const cleanup = async () => { + if (session) { + await session.stop() + session = null + } + } + + return { + tools: [startTool, sendInputTool, captureOutputTool, stopTool], + cleanup, + } +} diff --git a/freebuff/e2e/utils/tmux-helpers.ts b/freebuff/e2e/utils/tmux-helpers.ts new file mode 100644 index 0000000000..40999a3360 --- /dev/null +++ b/freebuff/e2e/utils/tmux-helpers.ts @@ -0,0 +1,83 @@ +import { execFileSync } from 'child_process' + +import { REPO_ROOT } from './binary-helpers' + +const SCRIPTS_DIR = `${REPO_ROOT}/scripts/tmux` + +const EXEC_OPTIONS = { encoding: 'utf-8' as const, cwd: REPO_ROOT } + +export interface TmuxStartOptions { + command: string + name?: string + width?: number + height?: number + waitSeconds?: number +} + +export function tmuxStart(options: TmuxStartOptions): string { + 
const args: 
/**
 * Send text to a tmux session by running `tmux-send.sh` through bash.
 *
 * Arguments are passed to the script in this order: session name, text, then
 * any of `--no-enter`, `--wait-idle <n>`, `--force` whose option is truthy
 * (so `waitIdle: 0` adds no flag).
 *
 * @param sessionName - First positional argument given to the script.
 * @param text - Second positional argument given to the script.
 * @param options - Optional flags, each forwarded only when truthy.
 */
export function tmuxSend(
  sessionName: string,
  text: string,
  options?: { noEnter?: boolean; waitIdle?: number; force?: boolean },
): void {
  const { noEnter, waitIdle, force } = options ?? {}

  const flags: string[] = []
  if (noEnter) flags.push('--no-enter')
  if (waitIdle) flags.push('--wait-idle', String(waitIdle))
  if (force) flags.push('--force')

  execFileSync(
    'bash',
    [`${SCRIPTS_DIR}/tmux-send.sh`, sessionName, text, ...flags],
    EXEC_OPTIONS,
  )
}

/**
 * Send a single key to a tmux session by running `tmux-send.sh` with `--key`.
 *
 * @param sessionName - First positional argument given to the script.
 * @param key - Value passed after the `--key` flag.
 */
export function tmuxSendKey(sessionName: string, key: string): void {
  const script = `${SCRIPTS_DIR}/tmux-send.sh`
  execFileSync('bash', [script, sessionName, '--key', key], EXEC_OPTIONS)
}

/**
 * Run `tmux-capture.sh` for a session and return what the script printed.
 *
 * `--wait <n>`, `--label <s>` and `--no-save` are appended, in that order,
 * only when the matching option is truthy (so `waitSeconds: 0` and an empty
 * label add no flag). All three stdio streams are piped.
 *
 * @param sessionName - First positional argument given to the script.
 * @param options - Optional capture settings forwarded as script flags.
 * @returns The script's stdout, decoded with the encoding from `EXEC_OPTIONS`.
 */
export function tmuxCapture(
  sessionName: string,
  options?: { waitSeconds?: number; label?: string; noSave?: boolean },
): string {
  const { waitSeconds, label, noSave } = options ?? {}

  const scriptArgs: string[] = [
    `${SCRIPTS_DIR}/tmux-capture.sh`,
    sessionName,
    ...(waitSeconds ? ['--wait', String(waitSeconds)] : []),
    ...(label ? ['--label', label] : []),
    ...(noSave ? ['--no-save'] : []),
  ]

  return execFileSync('bash', scriptArgs, {
    ...EXEC_OPTIONS,
    stdio: ['pipe', 'pipe', 'pipe'],
  })
}

/**
 * Stop a tmux session by running `tmux-stop.sh`; never throws.
 *
 * @param sessionName - First positional argument given to the script.
 */
export function tmuxStop(sessionName: string): void {
  const stopArgs = [`${SCRIPTS_DIR}/tmux-stop.sh`, sessionName]
  try {
    execFileSync('bash', stopArgs, EXEC_OPTIONS)
  } catch {
    // tmux-stop.sh is idempotent; ignore errors if session already gone
  }
}
describe('API error handling', () => {
  const COMPLETIONS_URL = 'https://api.codebuff.com/v1/chat/completions'

  // Runs the test agent with `handleSteps` removed from the template.
  // Built inside the helper (not at describe time) so it picks up whatever
  // `mockTemplate` and `loopAgentStepsBaseParams` hold when the test executes.
  const runLlmOnlyAgent = () => {
    const llmOnlyTemplate = {
      ...mockTemplate,
      handleSteps: undefined,
    }
    const localAgentTemplates = {
      'test-agent': llmOnlyTemplate,
    }
    return loopAgentSteps({
      ...loopAgentStepsBaseParams,
      agentType: 'test-agent',
      localAgentTemplates,
    })
  }

  it('should propagate error code and server message from 403 APICallError responseBody', async () => {
    // Make the LLM stream fail with a 403 whose responseBody carries the
    // server's structured error (code + human-readable message).
    loopAgentStepsBaseParams.promptAiSdkStream = async function* () {
      throw new APICallError({
        statusCode: 403,
        message: 'Forbidden',
        url: COMPLETIONS_URL,
        requestBodyValues: {},
        responseBody: JSON.stringify({
          error: 'free_mode_unavailable',
          message: 'Free mode is not available in your country.',
        }),
        isRetryable: false,
      })
    }

    const { output } = await runLlmOnlyAgent()

    expect(output.type).toBe('error')
    if (output.type !== 'error') return

    // The server's message wins over the generic "Forbidden"...
    expect(output.message).toBe('Free mode is not available in your country.')
    // ...and is passed through without the 'Agent run error: ' prefix.
    expect(output.message).not.toContain('Agent run error:')
    // The error code is propagated so the CLI can match on it.
    expect(output.error).toBe('free_mode_unavailable')
    // The HTTP status is propagated as well.
    expect(output.statusCode).toBe(403)
  })

  it('should prefix with "Agent run error:" when responseBody has no parseable message', async () => {
    // Same failure path, but this time the error has no responseBody at all.
    loopAgentStepsBaseParams.promptAiSdkStream = async function* () {
      throw new APICallError({
        statusCode: 500,
        message: 'Internal Server Error',
        url: COMPLETIONS_URL,
        requestBodyValues: {},
        responseBody: undefined,
        isRetryable: true,
      })
    }

    const { output } = await runLlmOnlyAgent()

    expect(output.type).toBe('error')
    if (output.type !== 'error') return

    // With no server message, the prefixed fallback message is used.
    expect(output.message).toContain('Agent run error:')
    expect(output.message).toContain('Internal Server Error')
    // Nothing could be parsed from the body, so no error code is set.
    expect(output.error).toBeUndefined()
  })
})
errorMessage : 'Agent run error: ' + errorMessage, ...(statusCode !== undefined && { statusCode }), + ...(errorCode !== undefined && { error: errorCode }), }, } } diff --git a/sdk/src/run.ts b/sdk/src/run.ts index 13b6562624..f0d150ca01 100644 --- a/sdk/src/run.ts +++ b/sdk/src/run.ts @@ -15,6 +15,7 @@ import { import { toolNames } from '@codebuff/common/tools/constants' import { clientToolCallSchema } from '@codebuff/common/tools/list' import { AgentOutputSchema } from '@codebuff/common/types/session-state' +import { parseApiErrorResponseBody } from '@codebuff/common/util/error' import { cloneDeep } from 'lodash' import { getErrorStatusCode } from './error-utils' @@ -516,25 +517,13 @@ async function runOnce({ // Extract structured error details from the API response body // (e.g., AI SDK's AI_APICallError includes a responseBody with the server's JSON response) - let errorCode: string | undefined const responseBody = error && typeof error === 'object' && 'responseBody' in error ? (error as { responseBody: unknown }).responseBody : undefined - if (typeof responseBody === 'string') { - try { - const parsed: unknown = JSON.parse(responseBody) - if (parsed && typeof parsed === 'object') { - if ('error' in parsed && typeof (parsed as { error: unknown }).error === 'string') { - errorCode = (parsed as { error: string }).error - } - if ('message' in parsed && typeof (parsed as { message: unknown }).message === 'string') { - errorMessage = (parsed as { message: string }).message - } - } - } catch { - // responseBody wasn't valid JSON; keep original errorMessage - } + const { errorCode, message: parsedMessage } = parseApiErrorResponseBody(responseBody) + if (parsedMessage) { + errorMessage = parsedMessage } resolve({