174 changes: 174 additions & 0 deletions internal/adapters/github/gh2432_test.go
@@ -0,0 +1,174 @@
package github

import (
	"context"
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/qf-studio/pilot/internal/testutil"
)

// GH-2432: Retry counter is persisted via labels (pilot-retry-1, pilot-retry-2,
// pilot-retry-exhausted) so the budget survives `pilot start` restarts.

// First retry on a fresh pilot-retry-ready issue should add pilot-retry-1.
func TestPoller_RetryLabel_FirstAttemptAddsRetry1(t *testing.T) {
	now := time.Now()
	issues := []*Issue{
		{Number: 42, State: "open", Title: "Retry-ready", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}}, CreatedAt: now.Add(-1 * time.Hour)},
	}

	var addedLabels atomic.Value
	addedLabels.Store([]string(nil))

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		switch {
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/issues/42/labels"):
			var body struct {
				Labels []string `json:"labels"`
			}
			_ = json.NewDecoder(r.Body).Decode(&body)
			prev, _ := addedLabels.Load().([]string)
			addedLabels.Store(append(prev, body.Labels...))
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`[]`))
			return
		case r.Method == http.MethodDelete:
			w.WriteHeader(http.StatusOK)
			return
		case r.URL.Path == "/search/issues":
			_, _ = w.Write([]byte(`{"total_count": 0}`))
			return
		}
		_ = json.NewEncoder(w).Encode(issues)
	}))
	defer server.Close()

	client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL)
	poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0))

	issue, err := poller.findOldestUnprocessedIssue(context.Background())
	if err != nil {
		t.Fatalf("findOldestUnprocessedIssue() error = %v", err)
	}
	if issue == nil || issue.Number != 42 {
		t.Fatalf("expected #42 to be dispatched, got %v", issue)
	}

	added, _ := addedLabels.Load().([]string)
	foundRetry1 := false
	for _, l := range added {
		if l == LabelRetry1 {
			foundRetry1 = true
		}
	}
	if !foundRetry1 {
		t.Errorf("expected pilot-retry-1 to be added, got labels=%v", added)
	}
}

// pilot-retry-2 + pilot-retry-ready → escalate to pilot-retry-exhausted, no dispatch.
func TestPoller_RetryLabel_Retry2EscalatesToExhausted(t *testing.T) {
	now := time.Now()
	issues := []*Issue{
		{Number: 42, State: "open", Title: "Retry-ready", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetry2}}, CreatedAt: now.Add(-1 * time.Hour)},
		{Number: 43, State: "open", Title: "Available", Labels: []Label{{Name: "pilot"}}, CreatedAt: now},
	}

	var addedLabels atomic.Value
	addedLabels.Store([]string(nil))

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		switch {
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/issues/42/labels"):
			var body struct {
				Labels []string `json:"labels"`
			}
			_ = json.NewDecoder(r.Body).Decode(&body)
			prev, _ := addedLabels.Load().([]string)
			addedLabels.Store(append(prev, body.Labels...))
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`[]`))
			return
		case r.Method == http.MethodDelete:
			w.WriteHeader(http.StatusOK)
			return
		case r.URL.Path == "/search/issues":
			_, _ = w.Write([]byte(`{"total_count": 0}`))
			return
		}
		_ = json.NewEncoder(w).Encode(issues)
	}))
	defer server.Close()

	client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL)
	poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0))

	issue, err := poller.findOldestUnprocessedIssue(context.Background())
	if err != nil {
		t.Fatalf("findOldestUnprocessedIssue() error = %v", err)
	}
	if issue == nil {
		t.Fatal("expected #43 to be dispatched (skipping exhausted #42)")
	}
	if issue.Number != 43 {
		t.Errorf("got issue #%d, want #43", issue.Number)
	}

	added, _ := addedLabels.Load().([]string)
	foundExhausted := false
	for _, l := range added {
		if l == LabelRetryExhausted {
			foundExhausted = true
		}
	}
	if !foundExhausted {
		t.Errorf("expected pilot-retry-exhausted to be stamped on #42, got %v", added)
	}
}

// pilot-retry-exhausted is terminal — never dispatched.
func TestPoller_RetryLabel_ExhaustedIsTerminal(t *testing.T) {
	now := time.Now()
	issues := []*Issue{
		{Number: 42, State: "open", Title: "Exhausted", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetryExhausted}}, CreatedAt: now.Add(-1 * time.Hour)},
		{Number: 43, State: "open", Title: "Available", Labels: []Label{{Name: "pilot"}}, CreatedAt: now},
	}

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(issues)
	}))
	defer server.Close()

	client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL)
	poller, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0))

	issue, err := poller.findOldestUnprocessedIssue(context.Background())
	if err != nil {
		t.Fatalf("findOldestUnprocessedIssue() error = %v", err)
	}
	if issue == nil || issue.Number != 43 {
		t.Errorf("expected #43, got %v", issue)
	}
}

// RetryStateLabels exposes the canonical list for cleanup on merge.
func TestRetryStateLabels_OrderingAndCompleteness(t *testing.T) {
	want := []string{LabelRetry1, LabelRetry2, LabelRetryExhausted}
	if len(RetryStateLabels) != len(want) {
		t.Fatalf("RetryStateLabels len = %d, want %d", len(RetryStateLabels), len(want))
	}
	for i, l := range want {
		if RetryStateLabels[i] != l {
			t.Errorf("RetryStateLabels[%d] = %q, want %q", i, RetryStateLabels[i], l)
		}
	}
}
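The three tests above pin down a small state machine: no counter label, then pilot-retry-1, then pilot-retry-2, then pilot-retry-exhausted. A minimal sketch distilling the transitions the tests assert, using the package's label constants; the retryTransition name and the map-of-labels representation are illustrative only, not part of the PR:

// retryTransition models the escalation applied when the poller encounters
// pilot-retry-ready: which counter label to drop, which to stamp, and
// whether the issue is still dispatched.
func retryTransition(labels map[string]bool) (remove, add string, dispatch bool) {
	switch {
	case labels[LabelRetryExhausted]:
		return "", "", false // terminal: never dispatched again
	case labels[LabelRetry2]:
		return LabelRetry2, LabelRetryExhausted, false // budget spent: stamp and skip
	case labels[LabelRetry1]:
		return LabelRetry1, LabelRetry2, true // second retry
	default:
		return "", LabelRetry1, true // first retry
	}
}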
78 changes: 67 additions & 11 deletions internal/adapters/github/poller.go
@@ -1324,6 +1324,11 @@ func (p *Poller) shouldRetryFailedIssue(ctx context.Context, issue *Issue) bool
// shouldRetryRetryReadyIssue checks if a pilot-retry-ready issue should be auto-retried.
// Returns true if the issue should be retried (label removed), false if it should be skipped.
// GH-2276: Issues with pilot-retry-ready (PR closed without merge) get retried up to maxRetryReadyRetries times.
//
// GH-2432: Retry counter is persisted via GitHub labels (pilot-retry-1, -2,
// -exhausted) so the count survives `pilot start` restarts. The previous
// in-memory map silently reset on restart, allowing pathological issues to
// consume Opus indefinitely.
func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) bool {
	// Don't retry closed issues
	if issue.State != "open" {
@@ -1339,15 +1344,46 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) bool
		return false
	}

	// GH-2432: terminal state — exhausted retries never retry again.
	if HasLabel(issue, LabelRetryExhausted) {
		p.logger.Warn("Issue is pilot-retry-exhausted, skipping",
			slog.Int("number", issue.Number),
		)
		return false
	}

	// Determine the next retry-counter label based on current state.
	var currentRetryLabel, nextRetryLabel string
	switch {
	case HasLabel(issue, LabelRetry2):
		currentRetryLabel = LabelRetry2
		nextRetryLabel = LabelRetryExhausted
	case HasLabel(issue, LabelRetry1):
		currentRetryLabel = LabelRetry1
		nextRetryLabel = LabelRetry2
	default:
		nextRetryLabel = LabelRetry1
	}

	// If escalating to exhausted, mark the label and skip dispatch.
	if nextRetryLabel == LabelRetryExhausted {
		if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, currentRetryLabel); err != nil {
			p.logger.Warn("Failed to remove prior retry label",
				slog.Int("number", issue.Number),
				slog.String("label", currentRetryLabel),
				slog.Any("error", err),
			)
		}
		if err := p.client.AddLabels(ctx, p.owner, p.repo, issue.Number, []string{LabelRetryExhausted}); err != nil {
			p.logger.Warn("Failed to add pilot-retry-exhausted label",
				slog.Int("number", issue.Number),
				slog.Any("error", err),
			)
		}
		// Also clear pilot-retry-ready so the poller doesn't keep finding it.
		_ = p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, LabelRetryReady)
		p.logger.Warn("Issue exhausted retry budget — escalated to pilot-retry-exhausted",
			slog.Int("number", issue.Number),
			slog.Int("max", p.maxRetryReadyRetries),
		)
		return false
	}
@@ -1357,7 +1393,26 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) bool
		return false
	}

	// Swap retry-N label: remove current (if any), add next.
	if currentRetryLabel != "" {
		if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, currentRetryLabel); err != nil {
			p.logger.Warn("Failed to remove prior retry label",
				slog.Int("number", issue.Number),
				slog.String("label", currentRetryLabel),
				slog.Any("error", err),
			)
		}
	}
	if err := p.client.AddLabels(ctx, p.owner, p.repo, issue.Number, []string{nextRetryLabel}); err != nil {
		p.logger.Warn("Failed to add retry label",
			slog.Int("number", issue.Number),
			slog.String("label", nextRetryLabel),
			slog.Any("error", err),
		)
		// Continue anyway; we'd rather retry than block on a label flake.
	}

	// Remove pilot-retry-ready label so the poller doesn't loop the same issue.
	if err := p.client.RemoveLabel(ctx, p.owner, p.repo, issue.Number, LabelRetryReady); err != nil {
		p.logger.Warn("Failed to remove pilot-retry-ready label for retry",
			slog.Int("number", issue.Number),
@@ -1366,17 +1421,18 @@ func (p *Poller) shouldRetryRetryReadyIssue(ctx context.Context, issue *Issue) bool
return false
}

// Keep the legacy in-memory counter in sync so existing tests/state observers
// remain consistent. GH-2432: this is now a mirror, not the source of truth.
p.mu.Lock()
p.retryReadyCount[issue.Number] = retries + 1
p.retryReadyCount[issue.Number]++
p.mu.Unlock()

// Clear from processed map so the issue can be re-picked
p.ClearProcessed(issue.Number)

p.logger.Info("Auto-retrying pilot-retry-ready issue",
slog.Int("number", issue.Number),
slog.Int("retry", retries+1),
slog.Int("max", p.maxRetryReadyRetries),
slog.String("retry_label", nextRetryLabel),
)

return true
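Two reading notes on the poller change. First, the mutation order matters for crash safety: the next counter label is stamped before pilot-retry-ready is removed, so a process killed between those two steps burns a retry instead of resetting the budget. Second, the escalation logic leans on a HasLabel helper the diff does not show; a minimal sketch of what it presumably looks like, inferred from the Issue and Label shapes used in the tests (the in-tree implementation may differ):

// HasLabel reports whether the issue currently carries the named label.
func HasLabel(issue *Issue, name string) bool {
	for _, l := range issue.Labels {
		if l.Name == name {
			return true
		}
	}
	return false
}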
10 changes: 4 additions & 6 deletions internal/adapters/github/poller_test.go
@@ -2337,8 +2337,11 @@ func TestPoller_AutoRetryRetryReadyIssue_FirstRetry(t *testing.T) {

func TestPoller_AutoRetryRetryReadyIssue_RetryLimitReached(t *testing.T) {
	now := time.Now()
	// GH-2432: Retry state is now persisted via labels — an issue already at
	// pilot-retry-2 means the next escalation goes to pilot-retry-exhausted
	// (no dispatch).
	issues := []*Issue{
		{Number: 42, State: "open", Title: "Retry-ready issue", Labels: []Label{{Name: "pilot"}, {Name: LabelRetryReady}, {Name: LabelRetry2}}, CreatedAt: now.Add(-1 * time.Hour)},
		{Number: 43, State: "open", Title: "Available issue", Labels: []Label{{Name: "pilot"}}, CreatedAt: now},
	}

@@ -2353,11 +2356,6 @@ func TestPoller_AutoRetryRetryReadyIssue_RetryLimitReached(t *testing.T) {
		WithMaxRetryReadyRetries(3),
	)

	issue, err := poller.findOldestUnprocessedIssue(context.Background())
	if err != nil {
		t.Fatalf("findOldestUnprocessedIssue() error = %v", err)
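The setup this test drops (seeding retryReadyCount by hand) is exactly the state GH-2432 makes durable. A hedged sketch of a restart-survival test, reusing the fixture pattern from gh2432_test.go; it assumes the fake server echoes label mutations back into the issues it serves, which the handlers above do not do:

ctx := context.Background()
client := NewClientWithBaseURL(testutil.FakeGitHubToken, server.URL)

// First pass: a fresh poller sees pilot-retry-ready with no counter label
// and stamps pilot-retry-1.
p1, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0))
_, _ = p1.findOldestUnprocessedIssue(ctx)

// "Restart": a brand-new poller starts with an empty retryReadyCount map,
// but the pilot-retry-1 label persisted on the issue drives the next
// escalation to pilot-retry-2. The budget lives on the issue, not in memory.
p2, _ := NewPoller(client, "owner/repo", "pilot", 30*time.Second, WithRetryGracePeriod(0))
_, _ = p2.findOldestUnprocessedIssue(ctx)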
10 changes: 10 additions & 0 deletions internal/adapters/github/types.go
@@ -98,8 +98,18 @@
	LabelTitleRejected = "pilot-title-rejected" // GH-2363: title guard escalation; blocks auto-retry until human edits title
	LabelSuperseded    = "pilot-superseded"     // GH-2402: sub-issue auto-closed because parent epic already shipped the work
	LabelBlocked       = "pilot-blocked"        // GH-2402: deterministic failure that won't change between retries (e.g. non-conventional title)

	// GH-2432: Retry-counter labels persist retry state across `pilot start`
	// restarts (the in-memory retryReadyCount map was lost on restart, letting
	// broken issues consume Opus indefinitely).
	LabelRetry1         = "pilot-retry-1"
	LabelRetry2         = "pilot-retry-2"
	LabelRetryExhausted = "pilot-retry-exhausted"
)

// RetryStateLabels lists the retry-counter labels in escalation order. GH-2432.
var RetryStateLabels = []string{LabelRetry1, LabelRetry2, LabelRetryExhausted}

// Priority mapping from GitHub labels
type Priority int

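Because RetryStateLabels is ordered by escalation, the number of consumed retries is derivable from the labels alone, which is handy for dashboards or log enrichment. A hypothetical helper, not part of the PR; the value for the exhausted case assumes a budget of 3, matching WithMaxRetryReadyRetries(3) in the tests:

// RetryAttemptsFromLabels maps the highest retry-state label present to the
// number of retries already consumed.
func RetryAttemptsFromLabels(labels []Label) int {
	have := make(map[string]bool, len(labels))
	for _, l := range labels {
		have[l.Name] = true
	}
	switch {
	case have[LabelRetryExhausted]:
		return 3 // assumed budget; see WithMaxRetryReadyRetries
	case have[LabelRetry2]:
		return 2
	case have[LabelRetry1]:
		return 1
	default:
		return 0
	}
}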
12 changes: 12 additions & 0 deletions internal/autopilot/auto_merger.go
@@ -102,6 +102,18 @@ func (m *AutoMerger) MergePR(ctx context.Context, prState *PRState) error {
	}

	m.log.Info("PR merged", "pr", prState.PRNumber, "method", mergeMethod)

	// GH-2432: Strip retry-counter labels off the linked issue so a future
	// regression on the same issue starts the retry budget from zero again.
	if prState.IssueNumber > 0 {
		for _, label := range github.RetryStateLabels {
			if err := m.ghClient.RemoveLabel(ctx, m.owner, m.repo, prState.IssueNumber, label); err != nil {
				m.log.Debug("retry label cleanup: remove failed (likely not present)",
					"issue", prState.IssueNumber, "label", label, "error", err)
			}
		}
	}

	return nil
}

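The cleanup loop is small, but the same reset is plausibly wanted anywhere an issue's retry budget should restart, such as a manual unblock. A sketch of how it could be factored out; clearRetryState and the labelRemover interface are hypothetical, RemoveLabel's signature is taken from the calls in this diff, and errors are deliberately swallowed the way MergePR's Debug-level handling does, since a failure usually just means the label was not present:

// labelRemover is the one capability clearRetryState needs from the client.
type labelRemover interface {
	RemoveLabel(ctx context.Context, owner, repo string, number int, label string) error
}

// clearRetryState strips all retry-counter labels from an issue, returning
// its retry budget to zero. Absent labels are expected and ignored.
func clearRetryState(ctx context.Context, c labelRemover, owner, repo string, issue int) {
	for _, label := range github.RetryStateLabels {
		_ = c.RemoveLabel(ctx, owner, repo, issue, label)
	}
}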