From 5db7d977ce0ec056614624c0538b45d8898fbd82 Mon Sep 17 00:00:00 2001 From: Aleks Petrov Date: Wed, 29 Apr 2026 22:25:51 +0200 Subject: [PATCH] =?UTF-8?q?feat(executor):=20Opus=20plans,=20Sonnet=20exec?= =?UTF-8?q?utes=20=E2=80=94=20cut=20Pilot=20token=20spend=20(GH-2432)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routes Opus to where reasoning matters (epic planning) and Sonnet to verbose execution, addressing the 4× token spike in Pilot subprocess sessions. - ClaudeCodeConfig gains AllowedTools + MCPConfigPath; ExecuteOptions plumbs them through and the Claude Code args builder emits --allowedTools and --mcp-config when set. The default execution toolbox (Read, Write, Edit, Bash, Grep, Glob, Task) excludes MCP-provided tools to drop per-turn context bloat. - New PlanningConfig (default model claude-opus-4-7). PlanEpic now passes --model + --allowedTools=Read,Grep,Glob and exports ANTHROPIC_MODEL so Pilot's global Sonnet env var doesn't override it via Node's last-write env lookup. - model_routing.complex default flipped from claude-opus-4-6 to claude-sonnet-4-6 — Opus is reserved for planning only. - Retry counter persisted via labels (pilot-retry-1, pilot-retry-2, pilot-retry-exhausted). Survives `pilot start` restarts; the previous in-memory retryReadyCount silently reset on restart and let pathological issues consume Opus indefinitely. The auto-merger strips retry labels on successful merge so a future regression starts fresh. - HooksConfig.RunTestsOnStop default flipped to false — Stop-hook tests forced long unproductive turns into the Claude session. --- internal/adapters/github/gh2432_test.go | 174 ++++++++++++++++++++++++ internal/adapters/github/poller.go | 78 +++++++++-- internal/adapters/github/poller_test.go | 10 +- internal/adapters/github/types.go | 10 ++ internal/autopilot/auto_merger.go | 12 ++ internal/executor/backend.go | 57 +++++++- internal/executor/backend_claudecode.go | 10 ++ internal/executor/epic.go | 20 ++- internal/executor/gh2432_test.go | 128 +++++++++++++++++ internal/executor/hooks.go | 6 +- internal/executor/hooks_test.go | 5 +- internal/executor/runner.go | 48 +++++-- 12 files changed, 524 insertions(+), 34 deletions(-) create mode 100644 internal/adapters/github/gh2432_test.go create mode 100644 internal/executor/gh2432_test.go diff --git a/internal/adapters/github/gh2432_test.go b/internal/adapters/github/gh2432_test.go new file mode 100644 index 00000000..d09d0520 --- /dev/null +++ b/internal/adapters/github/gh2432_test.go @@ -0,0 +1,174 @@ +package github + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/qf-studio/pilot/internal/testutil" +) + +// GH-2432: Retry counter is persisted via labels (pilot-retry-1, pilot-retry-2, +// pilot-retry-exhausted) so the budget survives `pilot start` restarts. + +// First retry on a fresh pilot-retry-ready issue should add pilot-retry-1.
+func TestPoller_RetryLabel_FirstAttemptAddsRetry1(t *testing.T) { + now := time.Now() + issues := []*Issue{ + {Number: 42, State: "open", Title: "Retry-ready", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}}, CreatedAt: now.Add(-1 * time.Hour)}, + } + + var addedLabels atomic.Value + addedLabels.Store([]string(nil)) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/issues/42/labels"): + var body struct { + Labels []string `json:"labels"` + } + _ = json.NewDecoder(r.Body).Decode(&body) + prev, _ := addedLabels.Load().([]string) + addedLabels.Store(append(prev, body.Labels...)) + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`[]`)) + return + case r.Method == http.MethodDelete: + w.WriteHeader(http.StatusOK) + return + case r.URL.Path == "/search/issues": + _, _ = w.Write([]byte(`{"total_count": 0}`)) + return + } + _ = json.NewEncoder(w).Encode(issues) + })) + defer server.Close() + + client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL) + poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0)) + + issue, err := poller.findOldestUnprocessedIssue(context.Background()) + if err != nil { + t.Fatalf("findOldestUnprocessedIssue() error = %v", err) + } + if issue == nil || issue.Number != 42 { + t.Fatalf("expected #42 to be dispatched, got %v", issue) + } + + added, _ := addedLabels.Load().([]string) + foundRetry1 := false + for _, l := range added { + if l == LabelRetry1 { + foundRetry1 = true + } + } + if !foundRetry1 { + t.Errorf("expected pilot-retry-1 to be added, got labels=%v", added) + } +} + +// pilot-retry-2 + pilot-retry-ready → escalate to pilot-retry-exhausted, no dispatch. 
+func TestPoller_RetryLabel_Retry2EscalatesToExhausted(t *testing.T) { + now := time.Now() + issues := []*Issue{ + {Number: 42, State: "open", Title: "Retry-ready", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetry2}}, CreatedAt: now.Add(-1 * time.Hour)}, + {Number: 43, State: "open", Title: "Available", Labels: []Label{{Name: "pilot"}}, CreatedAt: now}, + } + + var addedLabels atomic.Value + addedLabels.Store([]string(nil)) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/issues/42/labels"): + var body struct { + Labels []string `json:"labels"` + } + _ = json.NewDecoder(r.Body).Decode(&body) + prev, _ := addedLabels.Load().([]string) + addedLabels.Store(append(prev, body.Labels...)) + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`[]`)) + return + case r.Method == http.MethodDelete: + w.WriteHeader(http.StatusOK) + return + case r.URL.Path == "/search/issues": + _, _ = w.Write([]byte(`{"total_count": 0}`)) + return + } + _ = json.NewEncoder(w).Encode(issues) + })) + defer server.Close() + + client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL) + poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0)) + + issue, err := poller.findOldestUnprocessedIssue(context.Background()) + if err != nil { + t.Fatalf("findOldestUnprocessedIssue() error = %v", err) + } + if issue == nil { + t.Fatal("expected #43 to be dispatched (skipping exhausted #42)") + } + if issue.Number != 43 { + t.Errorf("got issue #%d, want #43", issue.Number) + } + + added, _ := addedLabels.Load().([]string) + foundExhausted := false + for _, l := range added { + if l == LabelRetryExhausted { + foundExhausted = true + } + } + if !foundExhausted { + t.Errorf("expected pilot-retry-exhausted to be stamped on #42, got %v", added) + } +} + +// pilot-retry-exhausted is terminal — never dispatched. +func TestPoller_RetryLabel_ExhaustedIsTerminal(t *testing.T) { + now := time.Now() + issues := []*Issue{ + {Number: 42, State: "open", Title: "Exhausted", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetryExhausted}}, CreatedAt: now.Add(-1 * time.Hour)}, + {Number: 43, State: "open", Title: "Available", Labels: []Label{{Name: "pilot"}}, CreatedAt: now}, + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(issues) + })) + defer server.Close() + + client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL) + poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0)) + + issue, err := poller.findOldestUnprocessedIssue(context.Background()) + if err != nil { + t.Fatalf("findOldestUnprocessedIssue() error = %v", err) + } + if issue == nil || issue.Number != 43 { + t.Errorf("expected #43, got %v", issue) + } +} + +// RetryStateLabels exposes the canonical list for cleanup on merge. 
+func TestRetryStateLabels_OrderingAndCompleteness(t *testing.T) { + want := []string{LabelRetry1, LabelRetry2, LabelRetryExhausted} + if len(RetryStateLabels) != len(want) { + t.Fatalf("RetryStateLabels len = %d, want %d", len(RetryStateLabels), len(want)) + } + for i, l := range want { + if RetryStateLabels[i] != l { + t.Errorf("RetryStateLabels[%d] = %q, want %q", i, RetryStateLabels[i], l) + } + } +} diff --git a/internal/adapters/github/poller.go b/internal/adapters/github/poller.go index 6b6fc5d0..76da979c 100644 --- a/internal/adapters/github/poller.go +++ b/internal/adapters/github/poller.go @@ -1324,6 +1324,11 @@ func (p *Poller) shouldRetryFailedIssue(ctx context.Context, issue *Issue) bool // shouldRetryRetryReadyIssue checks if a pilot-retry-ready issue should be auto-retried. // Returns true if the issue should be retried (label removed), false if it should be skipped. // GH-2276: Issues with pilot-retry-ready (PR closed without merge) get retried up to maxRetryReadyRetries times. +// +// GH-2432: Retry counter is persisted via GitHub labels (pilot-retry-1, -2, +// -exhausted) so the count survives `pilot start` restarts. The previous +// in-memory map silently reset on restart, allowing pathological issues to +// consume Opus indefinitely. func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) bool { // Don't retry closed issues if issue.State != "open" { @@ -1339,15 +1344,46 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) b return false } - p.mu.RLock() - retries := p.retryReadyCount[issue.Number] - p.mu.RUnlock() + // GH-2432: terminal state — exhausted retries never retry again. + if HasLabel(issue, LabelRetryExhausted) { + p.logger.Warn("Issue is pilot-retry-exhausted, skipping", + slog.Int("number", issue.Number), + ) + return false + } - if retries >= p.maxRetryReadyRetries { - p.logger.Warn("Issue has reached max retry-ready retries, skipping", + // Determine the next retry-counter label based on current state. + var currentRetryLabel, nextRetryLabel string + switch { + case HasLabel(issue, LabelRetry2): + currentRetryLabel = LabelRetry2 + nextRetryLabel = LabelRetryExhausted + case HasLabel(issue, LabelRetry1): + currentRetryLabel = LabelRetry1 + nextRetryLabel = LabelRetry2 + default: + nextRetryLabel = LabelRetry1 + } + + // If escalating to exhausted, mark the label and skip dispatch. + if nextRetryLabel == LabelRetryExhausted { + if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, currentRetryLabel); err != nil { + p.logger.Warn("Failed to remove prior retry label", + slog.Int("number", issue.Number), + slog.String("label", currentRetryLabel), + slog.Any("error", err), + ) + } + if err := p.client.AddLabels(ctx, p.owner, p.repo, issue.Number, []string{LabelRetryExhausted}); err != nil { + p.logger.Warn("Failed to add pilot-retry-exhausted label", + slog.Int("number", issue.Number), + slog.Any("error", err), + ) + } + // Also clear pilot-retry-ready so the poller doesn't keep finding it. 
+ _ = p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, LabelRetryReady) + p.logger.Warn("Issue exhausted retry budget — escalated to pilot-retry-exhausted", slog.Int("number", issue.Number), - slog.Int("retries", retries), - slog.Int("max", p.maxRetryReadyRetries), ) return false } @@ -1357,7 +1393,26 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) b return false } - // Remove pilot-retry-ready label and increment retry count + // Swap retry-N label: remove current (if any), add next. + if currentRetryLabel != "" { + if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, currentRetryLabel); err != nil { + p.logger.Warn("Failed to remove prior retry label", + slog.Int("number", issue.Number), + slog.String("label", currentRetryLabel), + slog.Any("error", err), + ) + } + } + if err := p.client.AddLabels(ctx, p.owner, p.repo, issue.Number, []string{nextRetryLabel}); err != nil { + p.logger.Warn("Failed to add retry label", + slog.Int("number", issue.Number), + slog.String("label", nextRetryLabel), + slog.Any("error", err), + ) + // Continue anyway; we'd rather retry than block on a label flake. + } + + // Remove pilot-retry-ready label so the poller doesn't loop the same issue. if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, LabelRetryReady); err != nil { p.logger.Warn("Failed to remove pilot-retry-ready label for retry", slog.Int("number", issue.Number), @@ -1366,8 +1421,10 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) b return false } + // Keep the legacy in-memory counter in sync so existing tests/state observers + // remain consistent. GH-2432: this is now a mirror, not the source of truth. p.mu.Lock() - p.retryReadyCount[issue.Number] = retries + 1 + p.retryReadyCount[issue.Number]++ p.mu.Unlock() // Clear from processed map so the issue can be re-picked @@ -1375,8 +1432,7 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) b p.logger.Info("Auto-retrying pilot-retry-ready issue", slog.Int("number", issue.Number), - slog.Int("retry", retries+1), - slog.Int("max", p.maxRetryReadyRetries), + slog.String("retry_label", nextRetryLabel), ) return true diff --git a/internal/adapters/github/poller_test.go b/internal/adapters/github/poller_test.go index e1f98d0f..3f20b3e5 100644 --- a/internal/adapters/github/poller_test.go +++ b/internal/adapters/github/poller_test.go @@ -2337,8 +2337,11 @@ func TestPoller_AutoRetryRetryReadyIssue_FirstRetry(t *testing.T) { func TestPoller_AutoRetryRetryReadyIssue_RetryLimitReached(t *testing.T) { now := time.Now() + // GH-2432: Retry state is now persisted via labels — issue already at + // pilot-retry-2 means the next escalation goes to pilot-retry-exhausted + // (no dispatch). 
issues := []*Issue{ - {Number: 42, State: "open", Title: "Retry-ready issue", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}}, CreatedAt: now.Add(-1 * time.Hour)}, + {Number: 42, State: "open", Title: "Retry-ready issue", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetry2}}, CreatedAt: now.Add(-1 * time.Hour)}, {Number: 43, State: "open", Title: "Available issue", Labels: []Label{{Name: "pilot"}}, CreatedAt: now}, } @@ -2353,11 +2356,6 @@ func TestPoller_AutoRetryRetryReadyIssue_RetryLimitReached(t *testing.T) { WithMaxRetryReadyRetries(3), ) - // Simulate: already retried 3 times - poller.mu.Lock() - poller.retryReadyCount[42] = 3 - poller.mu.Unlock() - issue, err := poller.findOldestUnprocessedIssue(context.Background()) if err != nil { t.Fatalf("findOldestUnprocessedIssue() error = %v", err) diff --git a/internal/adapters/github/types.go b/internal/adapters/github/types.go index 4e368390..11acd0d8 100644 --- a/internal/adapters/github/types.go +++ b/internal/adapters/github/types.go @@ -98,8 +98,18 @@ const ( LabelTitleRejected = "pilot-title-rejected" // GH-2363: title guard escalation; blocks auto-retry until human edits title LabelSuperseded = "pilot-superseded" // GH-2402: sub-issue auto-closed because parent epic already shipped the work LabelBlocked = "pilot-blocked" // GH-2402: deterministic failure that won't change between retries (e.g. non-conventional title) + + // GH-2432: Retry-counter labels persist retry state across `pilot start` + // restarts (the in-memory retryReadyCount map was lost on restart, letting + // broken issues consume Opus indefinitely). + LabelRetry1 = "pilot-retry-1" + LabelRetry2 = "pilot-retry-2" + LabelRetryExhausted = "pilot-retry-exhausted" ) +// RetryStateLabels lists the retry-counter labels in escalation order. GH-2432. +var RetryStateLabels = []string{LabelRetry1, LabelRetry2, LabelRetryExhausted} + // Priority mapping from GitHub labels type Priority int diff --git a/internal/autopilot/auto_merger.go b/internal/autopilot/auto_merger.go index 03efb26d..af6283bf 100644 --- a/internal/autopilot/auto_merger.go +++ b/internal/autopilot/auto_merger.go @@ -102,6 +102,18 @@ func (m *AutoMerger) MergePR(ctx context.Context, prState *PRState) error { } m.log.Info("PR merged", "pr", prState.PRNumber, "method", mergeMethod) + + // GH-2432: Strip retry-counter labels off the linked issue so a future + // regression on the same issue starts the retry budget from zero again. + if prState.IssueNumber > 0 { + for _, label := range github.RetryStateLabels { + if err := m.ghClient.RemoveLabel(ctx, m.owner, m.repo, prState.IssueNumber, label); err != nil { + m.log.Debug("retry label cleanup: remove failed (likely not present)", + "issue", prState.IssueNumber, "label", label, "error", err) + } + } + } + return nil } diff --git a/internal/executor/backend.go b/internal/executor/backend.go index c74583e0..73201a3e 100644 --- a/internal/executor/backend.go +++ b/internal/executor/backend.go @@ -69,6 +69,14 @@ type ExecuteOptions struct { // The callback receives the process PID and the watchdog timeout duration. // Called BEFORE the process is killed, allowing for alert emission. WatchdogCallback func(pid int, watchdogTimeout time.Duration) + + // AllowedTools restricts the set of tools the subprocess may use. + // Maps to the Claude Code CLI --allowedTools flag. GH-2432. + AllowedTools []string + + // MCPConfigPath is passed to the subprocess via --mcp-config. 
When empty, + // no MCP servers are loaded (avoiding tool-definition token bloat). GH-2432. + MCPConfigPath string } // BackendEvent represents a streaming event from the backend. @@ -261,6 +269,10 @@ type BackendConfig struct { // Hooks contains Claude Code hooks settings for quality gates during execution Hooks *HooksConfig `yaml:"hooks,omitempty"` + // Planning controls the model used for epic planning subprocesses (GH-2432). + // Planning gets Opus while execution stays on Sonnet for cost savings. + Planning *PlanningConfig `yaml:"planning,omitempty"` + // UseWorktree enables git worktree isolation for execution. // When true, Pilot creates a temporary worktree for each task, allowing // execution even when the user has uncommitted changes in their working directory. @@ -560,6 +572,44 @@ type ClaudeCodeConfig struct { // on long runs. When true, such tasks fall back to the lean non-Navigator // prompt. Default: false (Navigator context always injected when available). DisableNavigatorForEpic bool `yaml:"disable_navigator_for_epic,omitempty"` + + // AllowedTools restricts the set of tools the Claude Code subprocess may use. + // Maps to the CLI's --allowedTools flag. Default (when unset, see + // DefaultAllowedToolsExecution) keeps the standard execution toolbox while + // excluding heavy MCP-loaded tools, cutting per-turn token cost. GH-2432. + AllowedTools []string `yaml:"allowed_tools,omitempty"` + + // MCPConfigPath points to an MCP server config file passed via --mcp-config. + // When empty, the subprocess does NOT load any MCP servers — drastically + // reducing tool-definition tokens replayed on every turn. GH-2432. + MCPConfigPath string `yaml:"mcp_config_path,omitempty"` +} + +// PlanningConfig controls the model and behavior for epic planning subprocesses. +// Planning runs benefit from stronger reasoning (Opus); execution runs stay on +// the cheaper, verbose-friendly Sonnet. GH-2432. +type PlanningConfig struct { + // Model is the Claude model name passed to the planning subprocess via + // --model and the ANTHROPIC_MODEL env var. Default: "claude-opus-4-7". + Model string `yaml:"model,omitempty"` +} + +// DefaultPlanningConfig returns the default planning config (Opus 4.7). +func DefaultPlanningConfig() *PlanningConfig { + return &PlanningConfig{Model: "claude-opus-4-7"} +} + +// DefaultAllowedToolsExecution is the default --allowedTools list for execution +// subprocesses. Excludes MCP and Web* tools to cut per-turn context bloat. +// GH-2432. +func DefaultAllowedToolsExecution() []string { + return []string{"Read", "Write", "Edit", "Bash", "Grep", "Glob", "Task"} +} + +// DefaultAllowedToolsPlanning is the default --allowedTools list for the +// planning subprocess. Planning should not write code. GH-2432. +func DefaultAllowedToolsPlanning() []string { + return []string{"Read", "Grep", "Glob"} } // QwenCodeConfig contains Qwen Code backend configuration. 
@@ -607,7 +657,8 @@ func DefaultBackendConfig() *BackendConfig { PrePushLint: &prePushLint, PlanningTimeout: 2 * time.Minute, ClaudeCode: &ClaudeCodeConfig{ - Command: "claude", + Command: "claude", + AllowedTools: DefaultAllowedToolsExecution(), }, QwenCode: &QwenCodeConfig{ Command: "qwen", @@ -627,6 +678,7 @@ func DefaultBackendConfig() *BackendConfig { IntentJudge: DefaultIntentJudgeConfig(), Navigator: DefaultNavigatorConfig(), Hooks: DefaultHooksConfig(), + Planning: DefaultPlanningConfig(), Retry: DefaultRetryConfig(), Stagnation: DefaultStagnationConfig(), Simplification: DefaultSimplifyConfig(), @@ -649,7 +701,8 @@ func DefaultModelRoutingConfig() *ModelRoutingConfig { Trivial: "claude-haiku", Simple: "claude-sonnet-4-6", Medium: "claude-sonnet-4-6", - Complex: "claude-opus-4-6", + // GH-2432: Sonnet for "complex" too — Opus is reserved for planning only. + Complex: "claude-sonnet-4-6", } } diff --git a/internal/executor/backend_claudecode.go b/internal/executor/backend_claudecode.go index f28994d9..8dd52240 100644 --- a/internal/executor/backend_claudecode.go +++ b/internal/executor/backend_claudecode.go @@ -355,6 +355,16 @@ func (b *ClaudeCodeBackend) executeWithFromPR(ctx context.Context, opts ExecuteO b.log.Info("Using routed effort", slog.String("effort", opts.Effort)) } + // GH-2432: --allowedTools restricts the subprocess toolbox; --mcp-config + // scopes which MCP servers (if any) the subprocess loads. Both cut the + // per-turn token cost considerably when MCPs are not needed. + if len(opts.AllowedTools) > 0 { + args = append(args, "--allowedTools", strings.Join(opts.AllowedTools, ",")) + } + if opts.MCPConfigPath != "" { + args = append(args, "--mcp-config", opts.MCPConfigPath) + } + args = append(args, b.config.ExtraArgs...) cmd := exec.CommandContext(ctx, b.config.Command, args...) diff --git a/internal/executor/epic.go b/internal/executor/epic.go index dd8d1ea0..266db1c4 100644 --- a/internal/executor/epic.go +++ b/internal/executor/epic.go @@ -5,6 +5,7 @@ import ( "bytes" "context" "fmt" + "os" "os/exec" "regexp" "strconv" @@ -201,10 +202,25 @@ func (r *Runner) PlanEpic(ctx context.Context, task *Task, executionPath string) claudeCmd = r.config.ClaudeCode.Command } - // Run Claude Code with --print flag for planning - args := []string{"--print", "-p", prompt} + // GH-2432: Planning gets Opus for stronger reasoning; execution stays on + // Sonnet (set via the regular runner path). The model is also exported via + // ANTHROPIC_MODEL because Pilot's global env may otherwise win on Node's + // last-write lookup inside Claude Code (see backend_claudecode.go). + planningModel := "claude-opus-4-7" + if r.config != nil && r.config.Planning != nil && r.config.Planning.Model != "" { + planningModel = r.config.Planning.Model + } + + // Run Claude Code with --print flag for planning. Restrict tools to + // read-only — planning must not write code. + args := []string{ + "--print", "-p", prompt, + "--model", planningModel, + "--allowedTools", strings.Join(DefaultAllowedToolsPlanning(), ","), + } cmd := exec.CommandContext(ctx, claudeCmd, args...) 
+ cmd.Env = append(os.Environ(), "ANTHROPIC_MODEL="+planningModel) // Set working directory - use executionPath which respects worktree isolation if executionPath != "" { diff --git a/internal/executor/gh2432_test.go b/internal/executor/gh2432_test.go new file mode 100644 index 00000000..e994cbca --- /dev/null +++ b/internal/executor/gh2432_test.go @@ -0,0 +1,128 @@ +package executor + +import ( + "reflect" + "strings" + "testing" +) + +// GH-2432: "Opus plans, Sonnet executes" — verify the Sonnet/Opus split, +// AllowedTools/MCPConfig wiring, and Stop-hook default flip. + +func TestDefaultBackendConfig_SonnetForComplex(t *testing.T) { + cfg := DefaultBackendConfig() + if got, want := cfg.ModelRouting.Complex, "claude-sonnet-4-6"; got != want { + t.Errorf("ModelRouting.Complex = %q, want %q (GH-2432: Opus reserved for planning only)", got, want) + } +} + +func TestDefaultPlanningConfig_OpusModel(t *testing.T) { + cfg := DefaultPlanningConfig() + if cfg.Model != "claude-opus-4-7" { + t.Errorf("Planning.Model default = %q, want claude-opus-4-7", cfg.Model) + } +} + +func TestDefaultBackendConfig_PlanningWired(t *testing.T) { + cfg := DefaultBackendConfig() + if cfg.Planning == nil { + t.Fatal("DefaultBackendConfig() should populate Planning (GH-2432)") + } + if cfg.Planning.Model != "claude-opus-4-7" { + t.Errorf("DefaultBackendConfig.Planning.Model = %q, want claude-opus-4-7", cfg.Planning.Model) + } +} + +func TestDefaultClaudeCodeConfig_AllowedToolsExecution(t *testing.T) { + cfg := DefaultBackendConfig() + if cfg.ClaudeCode == nil { + t.Fatal("DefaultBackendConfig.ClaudeCode is nil") + } + want := DefaultAllowedToolsExecution() + if !reflect.DeepEqual(cfg.ClaudeCode.AllowedTools, want) { + t.Errorf("ClaudeCode.AllowedTools = %v, want %v", cfg.ClaudeCode.AllowedTools, want) + } + if cfg.ClaudeCode.MCPConfigPath != "" { + t.Errorf("ClaudeCode.MCPConfigPath = %q, want empty (no MCPs by default)", cfg.ClaudeCode.MCPConfigPath) + } +} + +func TestDefaultAllowedTools_PlanningIsReadOnly(t *testing.T) { + tools := DefaultAllowedToolsPlanning() + for _, banned := range []string{"Write", "Edit", "Bash"} { + for _, tool := range tools { + if tool == banned { + t.Errorf("planning tools contain %q — planning must be read-only", banned) + } + } + } + for _, required := range []string{"Read", "Grep", "Glob"} { + found := false + for _, tool := range tools { + if tool == required { + found = true + break + } + } + if !found { + t.Errorf("planning tools missing required tool %q", required) + } + } +} + +func TestExecuteOptions_HasAllowedToolsAndMCPConfigPath(t *testing.T) { + // Compile-time check via field access. 
+ opts := ExecuteOptions{ + AllowedTools: []string{"Read", "Bash"}, + MCPConfigPath: "/tmp/mcp.json", + } + if len(opts.AllowedTools) != 2 || opts.MCPConfigPath != "/tmp/mcp.json" { + t.Error("ExecuteOptions.AllowedTools/MCPConfigPath not wired (GH-2432)") + } +} + +func TestRunner_ExecutionToolOptions_FromConfig(t *testing.T) { + r := &Runner{ + config: &BackendConfig{ + ClaudeCode: &ClaudeCodeConfig{ + AllowedTools: []string{"Read", "Edit"}, + MCPConfigPath: "/path/to/mcp.json", + }, + }, + } + allowed, mcp := r.executionToolOptions() + if !reflect.DeepEqual(allowed, []string{"Read", "Edit"}) { + t.Errorf("allowed = %v, want [Read Edit]", allowed) + } + if mcp != "/path/to/mcp.json" { + t.Errorf("mcp = %q, want /path/to/mcp.json", mcp) + } +} + +func TestRunner_ExecutionToolOptions_NilSafe(t *testing.T) { + r := &Runner{config: nil} + allowed, mcp := r.executionToolOptions() + if allowed != nil || mcp != "" { + t.Errorf("nil config: allowed=%v mcp=%q, want nil/empty", allowed, mcp) + } +} + +func TestDefaultHooksConfig_RunTestsOnStop_FlippedToFalse(t *testing.T) { + cfg := DefaultHooksConfig() + if cfg.RunTestsOnStop == nil { + t.Fatal("RunTestsOnStop is nil") + } + if *cfg.RunTestsOnStop { + t.Error("RunTestsOnStop should default to false (GH-2432: cut subprocess token spend)") + } +} + +// Verify the planning tools list contains exactly what we ship by default — +// guards against accidental write-tool additions. +func TestDefaultAllowedToolsPlanning_Exact(t *testing.T) { + got := strings.Join(DefaultAllowedToolsPlanning(), ",") + want := "Read,Grep,Glob" + if got != want { + t.Errorf("planning tools = %q, want %q", got, want) + } +} diff --git a/internal/executor/hooks.go b/internal/executor/hooks.go index c809a2cf..33ce428f 100644 --- a/internal/executor/hooks.go +++ b/internal/executor/hooks.go @@ -46,8 +46,12 @@ type HooksConfig struct { } // DefaultHooksConfig returns default hooks configuration. +// GH-2432: RunTestsOnStop default flipped to false. Stop-hook tests forced +// long unproductive turns into the Claude session, inflating token cost +// without comparable quality gain. Quality gates still run after the +// subprocess exits. func DefaultHooksConfig() *HooksConfig { - runTestsOnStop := true + runTestsOnStop := false blockDestructive := true return &HooksConfig{ Enabled: false, // Disabled by default, opt-in feature diff --git a/internal/executor/hooks_test.go b/internal/executor/hooks_test.go index ab53878a..4518014c 100644 --- a/internal/executor/hooks_test.go +++ b/internal/executor/hooks_test.go @@ -13,8 +13,9 @@ func TestHooksConfig_Defaults(t *testing.T) { if config.Enabled { t.Error("Expected hooks to be disabled by default") } - if config.RunTestsOnStop == nil || !*config.RunTestsOnStop { - t.Error("Expected RunTestsOnStop to default to true when enabled") + // GH-2432: RunTestsOnStop default flipped to false to cut subprocess token spend. 
+ if config.RunTestsOnStop == nil || *config.RunTestsOnStop { + t.Error("Expected RunTestsOnStop to default to false (GH-2432)") } if config.BlockDestructive == nil || !*config.BlockDestructive { t.Error("Expected BlockDestructive to default to true when enabled") diff --git a/internal/executor/runner.go b/internal/executor/runner.go index d070519c..55a96daf 100644 --- a/internal/executor/runner.go +++ b/internal/executor/runner.go @@ -584,6 +584,16 @@ func (r *Runner) fallbackModelName() string { return r.backendType() } +// executionToolOptions returns the AllowedTools and MCPConfigPath that should +// be applied to every backend.Execute call site driven by this Runner. These +// shave the per-turn token cost by scoping the subprocess toolbox. GH-2432. +func (r *Runner) executionToolOptions() (allowed []string, mcpPath string) { + if r.config != nil && r.config.ClaudeCode != nil { + return r.config.ClaudeCode.AllowedTools, r.config.ClaudeCode.MCPConfigPath + } + return nil, "" +} + // SetBackend changes the execution backend. func (r *Runner) SetBackend(backend Backend) { r.backend = backend @@ -1603,6 +1613,7 @@ func (r *Runner) executeWithOptions(ctx context.Context, task *Task, allowWorktr // Watchdog kills subprocess after 2x timeout as a safety net for processes // that ignore context cancellation. watchdogTimeout := 2 * timeout + allowedTools, mcpConfigPath := r.executionToolOptions() backendResult, err := r.backend.Execute(ctx, ExecuteOptions{ Prompt: prompt, ProjectPath: executionPath, // Use worktree path if active @@ -1611,6 +1622,8 @@ func (r *Runner) executeWithOptions(ctx context.Context, task *Task, allowWorktr Effort: selectedEffort, FromPR: task.FromPR, // GH-1267: session resumption from PR context WatchdogTimeout: watchdogTimeout, + AllowedTools: allowedTools, + MCPConfigPath: mcpConfigPath, WatchdogCallback: func(pid int, watchdogDuration time.Duration) { log.Warn("Watchdog killed subprocess", slog.Int("pid", pid), @@ -1851,6 +1864,7 @@ func (r *Runner) executeWithOptions(ctx context.Context, task *Task, allowWorktr r.reportProgress(task.ID, "Re-executing", 55, fmt.Sprintf("Retry attempt %d with %v timeout...", state.smartRetryAttempt, retryTimeout)) + smartAllowed, smartMCP := r.executionToolOptions() retryResult, retryErr := r.backend.Execute(retryCtx, ExecuteOptions{ Prompt: prompt, ProjectPath: task.ProjectPath, @@ -1858,6 +1872,8 @@ func (r *Runner) executeWithOptions(ctx context.Context, task *Task, allowWorktr Model: selectedModel, Effort: selectedEffort, WatchdogTimeout: 2 * retryTimeout, + AllowedTools: smartAllowed, + MCPConfigPath: smartMCP, EventHandler: func(event BackendEvent) { if recorder != nil { _ = recorder.RecordEvent(event.Raw) @@ -2145,6 +2161,7 @@ The previous execution completed but made no code changes. This task requires ac %s`, task.Title, task.Description) // Execute retry + noopRetryAllowed, noopRetryMCP := r.executionToolOptions() retryResult, retryErr := r.backend.Execute(ctx, ExecuteOptions{ Prompt: retryPrompt, ProjectPath: task.ProjectPath, @@ -2152,6 +2169,8 @@ The previous execution completed but made no code changes. This task requires ac Model: selectedModel, Effort: selectedEffort, WatchdogTimeout: watchdogTimeout, + AllowedTools: noopRetryAllowed, + MCPConfigPath: noopRetryMCP, EventHandler: func(event BackendEvent) { // Track tokens from retry state.tokensInput += event.TokensInput @@ -2416,12 +2435,15 @@ The previous execution completed but made no code changes. 
This task requires ac ) // Re-invoke backend with retry prompt + feedbackAllowed, feedbackMCP := r.executionToolOptions() retryResult, retryErr := r.backend.Execute(ctx, ExecuteOptions{ - Prompt: retryPrompt, - ProjectPath: task.ProjectPath, - Verbose: task.Verbose, - Model: selectedModel, - Effort: selectedEffort, + Prompt: retryPrompt, + ProjectPath: task.ProjectPath, + Verbose: task.Verbose, + Model: selectedModel, + Effort: selectedEffort, + AllowedTools: feedbackAllowed, + MCPConfigPath: feedbackMCP, EventHandler: func(event BackendEvent) { if recorder != nil { if recErr := recorder.RecordEvent(event.Raw); recErr != nil { @@ -2692,12 +2714,15 @@ The previous execution completed but made no code changes. This task requires ac intentVerdict.Reason, task.Title, task.Description, ) + intentAllowed, intentMCP := r.executionToolOptions() _, retryErr := r.backend.Execute(ctx, ExecuteOptions{ - Prompt: retryPrompt, - ProjectPath: task.ProjectPath, - Verbose: task.Verbose, - Model: selectedModel, - Effort: selectedEffort, + Prompt: retryPrompt, + ProjectPath: task.ProjectPath, + Verbose: task.Verbose, + Model: selectedModel, + Effort: selectedEffort, + AllowedTools: intentAllowed, + MCPConfigPath: intentMCP, EventHandler: func(event BackendEvent) { state.tokensInput += event.TokensInput state.tokensOutput += event.TokensOutput @@ -3305,6 +3330,7 @@ func (r *Runner) runSelfReview(ctx context.Context, task *Task, state *progressS } } + reviewAllowed, reviewMCP := r.executionToolOptions() result, err := r.backend.Execute(reviewCtx, ExecuteOptions{ Prompt: reviewPrompt, ProjectPath: task.ProjectPath, @@ -3312,6 +3338,8 @@ func (r *Runner) runSelfReview(ctx context.Context, task *Task, state *progressS Model: selectedModel, Effort: selectedEffort, ResumeSessionID: resumeSessionID, + AllowedTools: reviewAllowed, + MCPConfigPath: reviewMCP, EventHandler: func(event BackendEvent) { // Track tokens from self-review state.tokensInput += event.TokensInput
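// Reviewer sketch (illustrative, not part of the diff): the planning
// invocation added in epic.go, reduced to the pieces this patch introduces.
// buildPlanningCmd is a hypothetical helper; PlanEpic's prompt construction,
// timeout, and working-directory handling are omitted. Assumes the executor
// package's context, os, os/exec, and strings imports.
func buildPlanningCmd(ctx context.Context, cfg *BackendConfig, prompt string) *exec.Cmd {
	model := DefaultPlanningConfig().Model // "claude-opus-4-7"
	if cfg != nil && cfg.Planning != nil && cfg.Planning.Model != "" {
		model = cfg.Planning.Model
	}
	args := []string{
		"--print", "-p", prompt,
		"--model", model,
		// Planning is read-only: "Read,Grep,Glob"
		"--allowedTools", strings.Join(DefaultAllowedToolsPlanning(), ","),
	}
	cmd := exec.CommandContext(ctx, "claude", args...)
	// Appended last so it wins Node's last-write env lookup over Pilot's
	// globally exported Sonnet model.
	cmd.Env = append(os.Environ(), "ANTHROPIC_MODEL="+model)
	return cmd
}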
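// Reviewer sketch (illustrative, not part of the diff): the label-based retry
// escalation that shouldRetryRetryReadyIssue now implements in
// internal/adapters/github, reduced to a pure function. nextRetryState is a
// hypothetical name; the real code also removes pilot-retry-ready, swaps
// labels via the GitHub client, and mirrors the legacy in-memory counter.
func nextRetryState(labels []string) (nextLabel string, dispatch bool) {
	has := func(name string) bool {
		for _, l := range labels {
			if l == name {
				return true
			}
		}
		return false
	}
	switch {
	case has(LabelRetryExhausted):
		return "", false // terminal: the issue is never dispatched again
	case has(LabelRetry2):
		return LabelRetryExhausted, false // third failure exhausts the budget; no dispatch
	case has(LabelRetry1):
		return LabelRetry2, true // second retry
	default:
		return LabelRetry1, true // first retry
	}
}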