agent: pass adaptive thinking config to SDK query() (ghostwright#104)

mcheemaa · phantom · commit 3115fbd7fc6d · 2026-05-14T23:28:54.000Z
* agent: pass adaptive thinking config to SDK query()

Opus 4.7 (and likely future models) reject the legacy `thinking.type: enabled` shape in API requests. Without an explicit `thinking` option on the SDK query(), the bundled CLI falls back to that legacy shape and the API returns a 400:

    invalid_request_error: "thinking.type.enabled" is not supported for this model. Use "thinking.type.adaptive" and "output_config.effort" to control thinking behavior.

Pass `thinking: { type: "adaptive" }` from chat-query.ts and judge-query.ts so the SDK forwards the supported shape on every query, regardless of which Opus or Sonnet variant is configured. The existing `effort` option continues to control thinking depth under the adaptive contract.

* agent: pass adaptive thinking on AgentRuntime query (Codex P1)

Round 1 of ghostwright#104 review caught the runtime.ts handleMessage call site was untouched, leaving inbound Slack/trigger/scheduler/MCP requests on Opus 4.7 still defaulting to the legacy thinking.type.enabled shape that returns 400 invalid_request_error. Mirror the explicit adaptive shape on this third call site so every entry point that reaches the SDK query() forwards the supported request shape. Also patch reflection-subprocess.ts, the fourth query() site, which runs on the same Opus tier during memory drains and would 400 the same way without the adaptive thinking option.

* agent: model-aware thinking config (Codex round 2 P1)

Round 2 found unconditional adaptive thinking breaks Haiku 4.5 in the reflection subprocess (and any future haiku-tier callsite). Adaptive is supported on Sonnet 4.6 and Opus 4.7; Haiku 4.5 needs the legacy enabled + budget_tokens shape. The fix replaces the four scattered adaptive-stamp lines with a single-source-of-truth getThinkingConfig(model) helper so every call site picks the right shape based on its model.
Verified end-to-end against the live Anthropic API on opus-4-7, sonnet-4-6, and haiku-4-5: opus rejects enabled (400), haiku rejects adaptive (400), sonnet accepts both. The helper maps each model family to the shape the API actually accepts, defaulting unknown models to adaptive because every model since Opus 4.7 has been adaptive-only. The helper is covered by 14 new unit tests against the full matrix; the chat, judge, runtime, and reflection callsites now pass its result as the `thinking` option instead of hard-coding a literal.
diff --git a/src/agent/__tests__/thinking-config.test.ts b/src/agent/__tests__/thinking-config.test.ts
@@ -0,0 +1,87 @@
+// Coverage for every cell of the model x thinking-shape matrix the live
+// Messages API enforces (verified 2026-04-29). The helper is the single
+// source of truth for SDK `query()` thinking options across chat, judge,
+// runtime, and reflection callsites.
+
+import { describe, expect, test } from "bun:test";
+import { JUDGE_MODEL_HAIKU, JUDGE_MODEL_OPUS, JUDGE_MODEL_SONNET } from "../../evolution/judge-models.ts";
+import { getThinkingConfig } from "../thinking-config.ts";
+
+describe("getThinkingConfig", () => {
+	test("Opus 4.7 returns adaptive (Opus 4.7 rejects manual enabled with 400)", () => {
+		expect(getThinkingConfig(JUDGE_MODEL_OPUS)).toEqual({ type: "adaptive" });
+		expect(getThinkingConfig("claude-opus-4-7")).toEqual({ type: "adaptive" });
+	});
+
+	test("Opus 4.6 returns adaptive (recommended; manual is deprecated)", () => {
+		expect(getThinkingConfig("claude-opus-4-6")).toEqual({ type: "adaptive" });
+	});
+
+	test("Sonnet 4.6 returns adaptive (recommended; manual still functional)", () => {
+		expect(getThinkingConfig(JUDGE_MODEL_SONNET)).toEqual({ type: "adaptive" });
+		expect(getThinkingConfig("claude-sonnet-4-6")).toEqual({ type: "adaptive" });
+	});
+
+	test("Mythos preview returns adaptive", () => {
+		expect(getThinkingConfig("claude-mythos-preview")).toEqual({ type: "adaptive" });
+	});
+
+	test("Haiku 4.5 returns enabled + budgetTokens (Haiku rejects adaptive with 400)", () => {
+		const config = getThinkingConfig(JUDGE_MODEL_HAIKU);
+		expect(config.type).toBe("enabled");
+		if (config.type === "enabled") {
+			expect(config.budgetTokens).toBeGreaterThan(0);
+		}
+	});
+
+	test("older Haiku 3.x returns enabled + budgetTokens", () => {
+		const config = getThinkingConfig("claude-haiku-3-5");
+		expect(config.type).toBe("enabled");
+	});
+
+	test("older Sonnet 3.x returns enabled + budgetTokens", () => {
+		const config = getThinkingConfig("claude-sonnet-3-7");
+		expect(config.type).toBe("enabled");
+	});
+
+	test("legacy Opus 4.5 returns enabled + budgetTokens", () => {
+		const config = getThinkingConfig("claude-opus-4-5");
+		expect(config.type).toBe("enabled");
+	});
+
+	test("undefined model defaults to adaptive (safe for all new models)", () => {
+		expect(getThinkingConfig(undefined)).toEqual({ type: "adaptive" });
+	});
+
+	test("null model defaults to adaptive", () => {
+		expect(getThinkingConfig(null)).toEqual({ type: "adaptive" });
+	});
+
+	test("empty string defaults to adaptive", () => {
+		expect(getThinkingConfig("")).toEqual({ type: "adaptive" });
+	});
+
+	test("unknown future model defaults to adaptive", () => {
+		// Every new model since Opus 4.7 has been adaptive-only, so when
+		// we do not recognise the prefix we send adaptive. A wrong guess
+		// returns a clear 400 with the required shape, which is preferable
+		// to silent breakage in reflection.
+		expect(getThinkingConfig("claude-future-model-2027")).toEqual({ type: "adaptive" });
+	});
+
+	test("provider-prefixed names still match by suffix-free comparison fail-safe", () => {
+		// Some operators set `model: "anthropic/claude-haiku-4-5"` via
+		// LiteLLM. The helper prefix-matches on the bare Anthropic id, so
+		// a slash-prefixed name matches no known prefix and falls through
+		// to the adaptive default. That is the safer of the two failure
+		// modes: adaptive will 400 with a clear error on Haiku rather than
+		// silently downgrading thinking.
+		expect(getThinkingConfig("anthropic/claude-haiku-4-5")).toEqual({ type: "adaptive" });
+	});
+
+	test("returned object is a fresh value (callers may spread it)", () => {
+		const a = getThinkingConfig(JUDGE_MODEL_OPUS);
+		const b = getThinkingConfig(JUDGE_MODEL_OPUS);
+		expect(a).toEqual(b);
+	});
+});
diff --git a/src/agent/chat-query.ts b/src/agent/chat-query.ts
@@ -18,6 +18,7 @@ import { extractCost, extractTextFromMessage } from "./message-utils.ts";
 import { permissionOptionsFromConfig } from "./permission-options.ts";
 import { assemblePrompt } from "./prompt-assembler.ts";
 import type { Session, SessionStore } from "./session-store.ts";
+import { getThinkingConfig } from "./thinking-config.ts";
 
 export type ChatQueryDeps = {
 	config: PhantomConfig;
@@ -106,6 +107,7 @@ export async function executeChatQuery(
 				},
 				persistSession: true,
 				effort: deps.config.effort,
+				thinking: getThinkingConfig(deps.config.model),
 				includePartialMessages: true,
 				agentProgressSummaries: true,
 				promptSuggestions: true,
diff --git a/src/agent/judge-query.ts b/src/agent/judge-query.ts
@@ -3,6 +3,7 @@ import { z } from "zod/v4";
 import { buildProviderEnv } from "../config/providers.ts";
 import type { PhantomConfig } from "../config/types.ts";
 import { extractTextFromMessage } from "./message-utils.ts";
+import { getThinkingConfig } from "./thinking-config.ts";
 
 // Judge subprocess integration. Routes LLM judge calls through the same
 // Agent SDK `query()` subprocess as the main agent so that auth, provider,
@@ -164,6 +165,7 @@ export async function runJudgeQuery<T>(
 			systemPrompt,
 			maxTurns: 1,
 			effort: "low",
+			thinking: getThinkingConfig(resolvedModel),
 			persistSession: false,
 			env: { ...process.env, ...providerEnv },
 		},
diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts
@@ -19,6 +19,7 @@ import { extractCost, extractTextFromMessage } from "./message-utils.ts";
 import { permissionOptionsFromConfig } from "./permission-options.ts";
 import { assemblePrompt } from "./prompt-assembler.ts";
 import { SessionStore } from "./session-store.ts";
+import { getThinkingConfig } from "./thinking-config.ts";
 
 export type RuntimeEvent =
 	| { type: "init"; sessionId: string }
@@ -206,6 +207,7 @@ export class AgentRuntime {
 					systemPrompt: { type: "preset" as const, preset: "claude_code" as const, append: appendPrompt },
 					persistSession: true,
 					effort: this.config.effort,
+					thinking: getThinkingConfig(this.config.model),
 					...(this.config.max_budget_usd > 0 ? { maxBudgetUsd: this.config.max_budget_usd } : {}),
 					abortController: controller,
 					env: { ...process.env, ...providerEnv },
diff --git a/src/agent/thinking-config.ts b/src/agent/thinking-config.ts
@@ -0,0 +1,52 @@
+// Single-source-of-truth picker for the Agent SDK `thinking` option.
+//
+// The matrix is non-uniform across models:
+//   - Opus 4.7 only accepts `{ type: "adaptive" }`. Manual `enabled +
+//     budget_tokens` is rejected with a 400.
+//   - Haiku 4.5 only accepts `{ type: "enabled", budget_tokens: N }`
+//     (`budgetTokens` in the SDK option). Adaptive is rejected with a 400.
+//   - Sonnet 4.6 accepts both shapes (manual is deprecated but still
+//     functional).
+//
+// Verified against the live Messages API on 2026-04-29; see the design
+// note at local/2026-04-29-thinking-config-design.md (local-only).
+//
+// Every SDK `query()` callsite passes `getThinkingConfig(model)` as its
+// `thinking` option rather than hard-coding one shape, so reflection (Haiku
+// tier), chat (Opus tier), judges (Sonnet tier), and the AgentRuntime path
+// all pick the correct shape. Unrecognised models default to adaptive because
+// every model Anthropic has shipped since Opus 4.7 only accepts adaptive.
+
+import type { ThinkingConfig } from "@anthropic-ai/claude-agent-sdk";
+
+const ADAPTIVE_PREFIXES: readonly string[] = [
+	"claude-opus-4-7",
+	"claude-opus-4-6",
+	"claude-sonnet-4-6",
+	"claude-mythos",
+];
+
+const MANUAL_ONLY_PREFIXES: readonly string[] = [
+	"claude-haiku-4",
+	"claude-haiku-3",
+	"claude-sonnet-3",
+	"claude-sonnet-4-5",
+	"claude-opus-4-5",
+];
+
+const MANUAL_BUDGET_TOKENS = 8192;
+
+export function getThinkingConfig(model: string | undefined | null): ThinkingConfig {
+	if (!model) return { type: "adaptive" };
+	if (ADAPTIVE_PREFIXES.some((p) => model.startsWith(p))) {
+		return { type: "adaptive" };
+	}
+	if (MANUAL_ONLY_PREFIXES.some((p) => model.startsWith(p))) {
+		return { type: "enabled", budgetTokens: MANUAL_BUDGET_TOKENS };
+	}
+	// Unknown model: prefer adaptive. Every model Anthropic has released
+	// since Opus 4.7 only accepts adaptive, so a new variant is far more
+	// likely to require adaptive than to require manual mode. If wrong,
+	// the API returns a clear 400 error with the required shape.
+	return { type: "adaptive" };
+}
diff --git a/src/evolution/reflection-subprocess.ts b/src/evolution/reflection-subprocess.ts
@@ -1,6 +1,7 @@
 import { appendFileSync, existsSync, mkdirSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { query } from "@anthropic-ai/claude-agent-sdk";
+import { getThinkingConfig } from "../agent/thinking-config.ts";
 import { buildProviderEnv } from "../config/providers.ts";
 import type { PhantomConfig } from "../config/types.ts";
 import type { EvolutionConfig } from "./config.ts";
@@ -647,6 +648,7 @@ async function defaultRunner(input: SpawnQueryInput): Promise<SpawnQueryResult>
 				permissionMode: "bypassPermissions",
 				allowDangerouslySkipPermissions: true,
 				tools: ["Read", "Write", "Edit", "Glob", "Grep"],
+				thinking: getThinkingConfig(model),
 				systemPrompt,
 				settings: {
 					permissions: { allow, deny },