Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions src/agent/__tests__/thinking-config.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// Coverage for every cell of the model x thinking-shape matrix the live
// Messages API enforces (verified 2026-04-29). The helper is the single
// source of truth for SDK `query()` thinking options across chat, judge,
// runtime, and reflection callsites.

import { describe, expect, test } from "bun:test";
import { JUDGE_MODEL_HAIKU, JUDGE_MODEL_OPUS, JUDGE_MODEL_SONNET } from "../../evolution/judge-models.ts";
import { getThinkingConfig } from "../thinking-config.ts";

// Suite for getThinkingConfig: one test per cell of the model x thinking-shape
// matrix, plus the fall-through cases (nullish, empty, unknown, and
// provider-prefixed model ids).
describe("getThinkingConfig", () => {
	// Asserts the manual shape in full: `type: "enabled"` AND a positive
	// `budgetTokens`. The `if` narrows the ThinkingConfig union so the
	// budget field is reachable under strict type checking.
	function expectManualShape(model: string): void {
		const config = getThinkingConfig(model);
		expect(config.type).toBe("enabled");
		if (config.type === "enabled") {
			expect(config.budgetTokens).toBeGreaterThan(0);
		}
	}

	test("Opus 4.7 returns adaptive (Opus 4.7 rejects manual enabled with 400)", () => {
		expect(getThinkingConfig(JUDGE_MODEL_OPUS)).toEqual({ type: "adaptive" });
		expect(getThinkingConfig("claude-opus-4-7")).toEqual({ type: "adaptive" });
	});

	test("Opus 4.6 returns adaptive (recommended; manual is deprecated)", () => {
		expect(getThinkingConfig("claude-opus-4-6")).toEqual({ type: "adaptive" });
	});

	test("Sonnet 4.6 returns adaptive (recommended; manual still functional)", () => {
		expect(getThinkingConfig(JUDGE_MODEL_SONNET)).toEqual({ type: "adaptive" });
		expect(getThinkingConfig("claude-sonnet-4-6")).toEqual({ type: "adaptive" });
	});

	test("Mythos preview returns adaptive", () => {
		expect(getThinkingConfig("claude-mythos-preview")).toEqual({ type: "adaptive" });
	});

	test("Haiku 4.5 returns enabled + budgetTokens (Haiku rejects adaptive with 400)", () => {
		expectManualShape(JUDGE_MODEL_HAIKU);
	});

	test("older Haiku 3.x returns enabled + budgetTokens", () => {
		expectManualShape("claude-haiku-3-5");
	});

	test("older Sonnet 3.x returns enabled + budgetTokens", () => {
		expectManualShape("claude-sonnet-3-7");
	});

	test("legacy Opus 4.5 returns enabled + budgetTokens", () => {
		expectManualShape("claude-opus-4-5");
	});

	test("undefined model defaults to adaptive (safe for all new models)", () => {
		expect(getThinkingConfig(undefined)).toEqual({ type: "adaptive" });
	});

	test("null model defaults to adaptive", () => {
		expect(getThinkingConfig(null)).toEqual({ type: "adaptive" });
	});

	test("empty string defaults to adaptive", () => {
		expect(getThinkingConfig("")).toEqual({ type: "adaptive" });
	});

	test("unknown future model defaults to adaptive", () => {
		// Every new model since Opus 4.7 has been adaptive-only, so when
		// we do not recognise the prefix we send adaptive. A wrong guess
		// returns a clear 400 with the required shape, which is preferable
		// to silent breakage in reflection.
		expect(getThinkingConfig("claude-future-model-2027")).toEqual({ type: "adaptive" });
	});

	test("provider-prefixed names fall through to the adaptive default", () => {
		// Some operators set `model: "anthropic/claude-haiku-4-5"` via
		// LiteLLM. The helper prefix-matches on the bare Anthropic id, so
		// a slash-prefixed name matches nothing and lands on the adaptive
		// default. That is the safer of the two failure modes: adaptive
		// will 400 with a clear error on Haiku rather than silently
		// downgrading thinking.
		expect(getThinkingConfig("anthropic/claude-haiku-4-5")).toEqual({ type: "adaptive" });
	});

	test("returned object is a fresh value (callers may spread it)", () => {
		const a = getThinkingConfig(JUDGE_MODEL_OPUS);
		const b = getThinkingConfig(JUDGE_MODEL_OPUS);
		// Same contents...
		expect(a).toEqual(b);
		// ...but distinct identities. `toEqual` alone would also pass for a
		// shared singleton, which is exactly what this test must rule out.
		expect(a).not.toBe(b);
	});
});
2 changes: 2 additions & 0 deletions src/agent/chat-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import { extractCost, extractTextFromMessage } from "./message-utils.ts";
import { permissionOptionsFromConfig } from "./permission-options.ts";
import { assemblePrompt } from "./prompt-assembler.ts";
import type { Session, SessionStore } from "./session-store.ts";
import { getThinkingConfig } from "./thinking-config.ts";

export type ChatQueryDeps = {
config: PhantomConfig;
Expand Down Expand Up @@ -106,6 +107,7 @@ export async function executeChatQuery(
},
persistSession: true,
effort: deps.config.effort,
thinking: getThinkingConfig(deps.config.model),
includePartialMessages: true,
agentProgressSummaries: true,
promptSuggestions: true,
Expand Down
2 changes: 2 additions & 0 deletions src/agent/judge-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { z } from "zod/v4";
import { buildProviderEnv } from "../config/providers.ts";
import type { PhantomConfig } from "../config/types.ts";
import { extractTextFromMessage } from "./message-utils.ts";
import { getThinkingConfig } from "./thinking-config.ts";

// Judge subprocess integration. Routes LLM judge calls through the same
// Agent SDK `query()` subprocess as the main agent so that auth, provider,
Expand Down Expand Up @@ -164,6 +165,7 @@ export async function runJudgeQuery<T>(
systemPrompt,
maxTurns: 1,
effort: "low",
thinking: getThinkingConfig(resolvedModel),
persistSession: false,
env: { ...process.env, ...providerEnv },
},
Expand Down
2 changes: 2 additions & 0 deletions src/agent/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { extractCost, extractTextFromMessage } from "./message-utils.ts";
import { permissionOptionsFromConfig } from "./permission-options.ts";
import { assemblePrompt } from "./prompt-assembler.ts";
import { SessionStore } from "./session-store.ts";
import { getThinkingConfig } from "./thinking-config.ts";

export type RuntimeEvent =
| { type: "init"; sessionId: string }
Expand Down Expand Up @@ -206,6 +207,7 @@ export class AgentRuntime {
systemPrompt: { type: "preset" as const, preset: "claude_code" as const, append: appendPrompt },
persistSession: true,
effort: this.config.effort,
thinking: getThinkingConfig(this.config.model),
...(this.config.max_budget_usd > 0 ? { maxBudgetUsd: this.config.max_budget_usd } : {}),
abortController: controller,
env: { ...process.env, ...providerEnv },
Expand Down
52 changes: 52 additions & 0 deletions src/agent/thinking-config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Single-source-of-truth picker for the Agent SDK `thinking` option.
//
// The matrix is non-uniform across models:
// - Opus 4.7 only accepts `{ type: "adaptive" }`. Manual `enabled +
// budget_tokens` is rejected with a 400.
// - Haiku 4.5 only accepts `{ type: "enabled", budget_tokens: N }`.
// Adaptive is rejected with a 400.
// - Sonnet 4.6 accepts both shapes (manual is deprecated but still
// functional).
//
// Verified against the live Messages API on 2026-04-29; see the design
// note at local/2026-04-29-thinking-config-design.md (local-only).
//
// Every SDK `query()` callsite passes `thinking: getThinkingConfig(model)`
// instead of hard-coding a single shape, so reflection (Haiku tier), chat
// (Opus tier), judges (Sonnet tier), and the AgentRuntime path all pick
// the correct shape. New models default to adaptive because every model
// Anthropic has shipped since Opus 4.7 only accepts adaptive.

import type { ThinkingConfig } from "@anthropic-ai/claude-agent-sdk";

// Model-id prefixes that getThinkingConfig resolves to `{ type: "adaptive" }`.
// Entries are compared with `startsWith`, so dated or suffixed variants of a
// listed id match as well.
const ADAPTIVE_PREFIXES: ReadonlyArray<string> = [
	"claude-opus-4-7",
	"claude-opus-4-6",
	"claude-sonnet-4-6",
	"claude-mythos",
];

// Model-id prefixes that getThinkingConfig resolves to the manual
// `{ type: "enabled", budgetTokens }` shape. Entries are compared with
// `startsWith` against the bare model id.
const MANUAL_ONLY_PREFIXES: readonly string[] = [
"claude-haiku-4",
"claude-haiku-3",
"claude-sonnet-3",
"claude-sonnet-4-5",
Comment on lines +30 to +33
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Match Claude 3.x IDs when selecting manual thinking

getThinkingConfig only checks claude-haiku-3* / claude-sonnet-3* prefixes, but Anthropic’s actual 3.x IDs are claude-3-5-haiku-* and claude-3-7-sonnet-*; those values currently fall through to the adaptive default and will send thinking: { type: "adaptive" } to models that require manual thinking (enabled + budget), causing runtime 400s for valid legacy model configurations. This regression is easy to trigger by setting model to a real 3.x ID (e.g. claude-3-5-haiku-20241022).

Useful? React with 👍 / 👎.

"claude-opus-4-5",
];

// Thinking-token budget attached to the manual `enabled` shape (8192 tokens).
const MANUAL_BUDGET_TOKENS = 8 * 1024;

// Anthropic's real 3.x model ids put the version before the family
// (`claude-3-5-haiku-20241022`, `claude-3-7-sonnet-...`), so they never match
// the family-first `claude-haiku-3` / `claude-sonnet-3` entries. This file
// already routes the 3.x generation to manual thinking, so the whole
// `claude-3-` namespace is matched here as well.
const LEGACY_3X_ID_PREFIX = "claude-3-";

/**
 * Picks the Agent SDK `thinking` option for a model id.
 *
 * Resolution order:
 *  1. Nullish or empty id -> adaptive.
 *  2. Id starts with an `ADAPTIVE_PREFIXES` entry -> adaptive.
 *  3. Id starts with `claude-3-` or a `MANUAL_ONLY_PREFIXES` entry ->
 *     `{ type: "enabled", budgetTokens: MANUAL_BUDGET_TOKENS }`.
 *  4. Anything else (unknown or provider-prefixed ids) -> adaptive.
 *
 * Matching is a plain `startsWith` on the id as given, so a provider-prefixed
 * id such as `anthropic/claude-haiku-4-5` deliberately falls through to
 * step 4.
 *
 * @param model - Bare model id, or `undefined` / `null` / `""` when unset.
 * @returns A freshly allocated config object on every call.
 */
export function getThinkingConfig(model: string | undefined | null): ThinkingConfig {
	if (!model) return { type: "adaptive" };

	const matchesAny = (prefixes: readonly string[]): boolean => prefixes.some((p) => model.startsWith(p));

	if (matchesAny(ADAPTIVE_PREFIXES)) {
		return { type: "adaptive" };
	}
	if (model.startsWith(LEGACY_3X_ID_PREFIX) || matchesAny(MANUAL_ONLY_PREFIXES)) {
		return { type: "enabled", budgetTokens: MANUAL_BUDGET_TOKENS };
	}
	// Unknown model: prefer adaptive. Every model Anthropic has released
	// since Opus 4.7 only accepts adaptive, so a new variant is far more
	// likely to require adaptive than to require manual mode. If wrong,
	// the API returns a clear 400 error with the required shape.
	return { type: "adaptive" };
}
2 changes: 2 additions & 0 deletions src/evolution/reflection-subprocess.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { appendFileSync, existsSync, mkdirSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { query } from "@anthropic-ai/claude-agent-sdk";
import { getThinkingConfig } from "../agent/thinking-config.ts";
import { buildProviderEnv } from "../config/providers.ts";
import type { PhantomConfig } from "../config/types.ts";
import type { EvolutionConfig } from "./config.ts";
Expand Down Expand Up @@ -610,6 +611,7 @@ async function defaultRunner(input: SpawnQueryInput): Promise<SpawnQueryResult>
permissionMode: "bypassPermissions",
allowDangerouslySkipPermissions: true,
tools: ["Read", "Write", "Edit", "Glob", "Grep"],
thinking: getThinkingConfig(model),
systemPrompt,
settings: {
permissions: { allow, deny },
Expand Down
Loading