diff --git a/actions/setup/js/copilot_sdk_driver.test.cjs b/actions/setup/js/copilot_sdk_driver.test.cjs index ff1f3ade9d4..f4ac95197c9 100644 --- a/actions/setup/js/copilot_sdk_driver.test.cjs +++ b/actions/setup/js/copilot_sdk_driver.test.cjs @@ -332,6 +332,429 @@ describe("copilot_sdk_driver.cjs", () => { expect(result.hasOutput).toBe(false); }); + it("post-completion idle watchdog fires and treats session as completed", async () => { + // Regression test: when sendAndWait hangs after the agent's final tool result + // (the SDK post-completion hang), the watchdog must force-disconnect and the + // driver must return exitCode 0 with the collected output. + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + // disconnectCalled resolves when the watchdog calls session.disconnect() + let resolveDisconnect; + const disconnectCalled = new Promise(resolve => { + resolveDisconnect = resolve; + }); + const disconnectWithSignal = vi.fn().mockImplementation(() => { + resolveDisconnect(); + return Promise.resolve(undefined); + }); + + const session = { + sessionId: "session-watchdog-fires", + on: handler => { + onEvent = handler; + }, + // sendAndWait emits events that satisfy completion conditions, then hangs + // until the watchdog forces a disconnect. + sendAndWait: vi.fn().mockImplementation(async () => { + onEvent({ + type: "tool.execution_start", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolName: "create_issue", mcpServerName: "safeoutputs", toolCallId: "call-watchdog" }, + }); + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "Issue filed successfully" }, + }); + onEvent({ + type: "tool.execution_complete", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolCallId: "call-watchdog", success: true }, + }); + // Simulate sendAndWait hanging — wait until the watchdog disconnects. + await disconnectCalled; + throw new Error("transport disconnected"); + }), + disconnect: disconnectWithSignal, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const prevIdleMs = process.env.GH_AW_SDK_IDLE_MS; + // Use a very short idle timeout so the watchdog fires quickly in tests. + process.env.GH_AW_SDK_IDLE_MS = "20"; + try { + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + expect(result.hasOutput).toBe(true); + expect(result.output).toContain("Issue filed successfully"); + // disconnect is called twice: once by the watchdog and once in finally. + expect(disconnectWithSignal).toHaveBeenCalled(); + expect(stop).toHaveBeenCalledTimes(1); + } finally { + if (prevIdleMs === undefined) delete process.env.GH_AW_SDK_IDLE_MS; + else process.env.GH_AW_SDK_IDLE_MS = prevIdleMs; + } + }); + + it("post-completion watchdog does not fire when tool calls are still pending", async () => { + // When a new tool call starts after the watchdog would have been armed, + // the watchdog must be disarmed so it does not fire while work is in progress. + const disconnect = vi.fn().mockResolvedValue(undefined); + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + + const session = { + sessionId: "session-watchdog-disarmed", + on: handler => { + onEvent = handler; + }, + sendAndWait: vi.fn().mockImplementation(async () => { + // First, reach the "post-completion" state that would arm the watchdog. + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "still working" }, + }); + // Then start a new tool call — this must disarm the watchdog. + onEvent({ + type: "tool.execution_start", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-new" }, + }); + // Complete the new tool call and produce more output. + onEvent({ + type: "tool.execution_complete", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolCallId: "call-new", success: true }, + }); + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "done now" }, + }); + // sendAndWait resolves normally — no disconnect needed. + return { data: { content: "done now" } }; + }), + disconnect, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const prevIdleMs = process.env.GH_AW_SDK_IDLE_MS; + process.env.GH_AW_SDK_IDLE_MS = "20"; + try { + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + expect(result.hasOutput).toBe(true); + expect(result.output).toContain("done now"); + // Only one disconnect: from the finally block (normal completion path). + expect(disconnect).toHaveBeenCalledTimes(1); + } finally { + if (prevIdleMs === undefined) delete process.env.GH_AW_SDK_IDLE_MS; + else process.env.GH_AW_SDK_IDLE_MS = prevIdleMs; + } + }); + + it("post-completion watchdog does not trigger when output not yet collected", async () => { + // The watchdog must not arm when no output has been collected — only + // after the agent has produced real work product. + const disconnect = vi.fn().mockResolvedValue(undefined); + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + + const session = { + sessionId: "session-watchdog-no-output", + on: handler => { + onEvent = handler; + }, + sendAndWait: vi.fn().mockImplementation(async () => { + // Tool completes but no assistant.message yet — watchdog must not arm. + onEvent({ + type: "tool.execution_start", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-early" }, + }); + onEvent({ + type: "tool.execution_complete", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { toolCallId: "call-early", success: true }, + }); + // Now session produces output and completes normally. + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "here is the result" }, + }); + return { data: { content: "here is the result" } }; + }), + disconnect, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const prevIdleMs = process.env.GH_AW_SDK_IDLE_MS; + process.env.GH_AW_SDK_IDLE_MS = "20"; + try { + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + expect(result.hasOutput).toBe(true); + expect(result.output).toContain("here is the result"); + // Disconnect only once: from the normal finally-block cleanup. + expect(disconnect).toHaveBeenCalledTimes(1); + } finally { + if (prevIdleMs === undefined) delete process.env.GH_AW_SDK_IDLE_MS; + else process.env.GH_AW_SDK_IDLE_MS = prevIdleMs; + } + }); + + it("post-completion watchdog does not treat success as failure when sendAndWait resolves before timer fires", async () => { + // When sendAndWait resolves normally before the watchdog fires, the session + // should complete with exitCode 0 and no disconnect from the watchdog. + const disconnect = vi.fn().mockResolvedValue(undefined); + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + + const session = { + sessionId: "session-watchdog-not-needed", + on: handler => { + onEvent = handler; + }, + sendAndWait: vi.fn().mockImplementation(async () => { + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "completed normally" }, + }); + // sendAndWait resolves before watchdog fires (watchdog idle = 500ms in test — + // large enough that normal completion always wins the race on any CI runner). + return { data: { content: "completed normally" } }; + }), + disconnect, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const prevIdleMs = process.env.GH_AW_SDK_IDLE_MS; + process.env.GH_AW_SDK_IDLE_MS = "500"; + try { + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + expect(result.hasOutput).toBe(true); + expect(result.output).toContain("completed normally"); + // Disconnect called once (finally), not twice. + expect(disconnect).toHaveBeenCalledTimes(1); + } finally { + if (prevIdleMs === undefined) delete process.env.GH_AW_SDK_IDLE_MS; + else process.env.GH_AW_SDK_IDLE_MS = prevIdleMs; + } + }); + + it("post-completion watchdog is disarmed during assistant.turn_start → turn_end cycle", async () => { + // When a new turn starts (assistant.turn_start) after the first output is + // produced and all tool calls complete, the watchdog must not fire until + // the turn ends (assistant.turn_end). This prevents a premature + // force-disconnect while the LLM is still doing inference. + const disconnect = vi.fn().mockResolvedValue(undefined); + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + + const session = { + sessionId: "session-watchdog-turn-disarm", + on: handler => { + onEvent = handler; + }, + sendAndWait: vi.fn().mockImplementation(async () => { + // Produce output and complete all tool calls — watchdog would arm here. + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "first result" }, + }); + // New turn starts — must disarm watchdog immediately. + onEvent({ + type: "assistant.turn_start", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { turnId: "turn-2" }, + }); + // Turn ends — watchdog may re-arm now. + onEvent({ + type: "assistant.turn_end", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { turnId: "turn-2" }, + }); + // Additional output and normal resolution. + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: " second result" }, + }); + return { data: { content: " second result" } }; + }), + disconnect, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const prevIdleMs = process.env.GH_AW_SDK_IDLE_MS; + // Short watchdog — if the turn_start guard is missing, the watchdog + // would fire before sendAndWait resolves. 1000ms gives enough headroom + // on slow CI while still detecting a missing guard reliably. + process.env.GH_AW_SDK_IDLE_MS = "1000"; + try { + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + expect(result.hasOutput).toBe(true); + // Disconnect called once (finally), not twice. + expect(disconnect).toHaveBeenCalledTimes(1); + } finally { + if (prevIdleMs === undefined) delete process.env.GH_AW_SDK_IDLE_MS; + else process.env.GH_AW_SDK_IDLE_MS = prevIdleMs; + } + }); + + it("session.task_complete is written to events.jsonl", async () => { + // session.task_complete must be serialized to the JSONL log so that + // unified_timeline.cjs can surface the agent's task summary. + const disconnect = vi.fn().mockResolvedValue(undefined); + const stop = vi.fn().mockResolvedValue(undefined); + let onEvent = () => {}; + const sessionId = "session-task-complete-jsonl"; + + const session = { + sessionId, + on: handler => { + onEvent = handler; + }, + sendAndWait: vi.fn().mockImplementation(async () => { + onEvent({ + type: "assistant.message", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { content: "work done" }, + }); + onEvent({ + type: "session.task_complete", + ephemeral: false, + timestamp: new Date().toISOString(), + data: { success: true, summary: "Created 3 issues successfully" }, + }); + return { data: { content: "work done" } }; + }), + disconnect, + }; + class FakeCopilotClient { + start = vi.fn().mockResolvedValue(undefined); + createSession = vi.fn().mockResolvedValue(session); + stop = stop; + } + + const result = await runWithCopilotSDK({ + sdkUri: "http://127.0.0.1:3002", + prompt: "test prompt", + logger: () => {}, + sdkModule: { + CopilotClient: FakeCopilotClient, + RuntimeConnection: { forUri: vi.fn(() => ({})) }, + approveAll: () => "allow", + }, + }); + + expect(result.exitCode).toBe(0); + + // Read the JSONL and verify the task_complete entry is present. + const eventsPath = path.join(testSessionStateDir, sessionId, "events.jsonl"); + const lines = fs + .readFileSync(eventsPath, "utf8") + .trim() + .split("\n") + .map(l => JSON.parse(l)); + const taskCompleteEvent = lines.find(e => e.type === "session.task_complete"); + expect(taskCompleteEvent).toBeDefined(); + expect(taskCompleteEvent.data.success).toBe(true); + expect(taskCompleteEvent.data.summary).toBe("Created 3 issues successfully"); + }); + it("passes custom provider and model through to SDK createSession", async () => { const disconnect = vi.fn().mockResolvedValue(undefined); const stop = vi.fn().mockResolvedValue(undefined); diff --git a/actions/setup/js/copilot_sdk_session.cjs b/actions/setup/js/copilot_sdk_session.cjs index 9f3129c43f0..be2e6aa5586 100644 --- a/actions/setup/js/copilot_sdk_session.cjs +++ b/actions/setup/js/copilot_sdk_session.cjs @@ -12,6 +12,12 @@ * SDK "tool.execution_start" → JSONL "tool.execution_start" (toolName, mcpServerName, command?) * SDK "tool.execution_complete" → JSONL "tool.execution_complete" (toolName, mcpServerName, success, result) * SDK "assistant.message" → JSONL "assistant.message" (content) + * SDK "assistant.turn_start" → watchdog disarmed (inAssistantTurn = true) + * SDK "assistant.turn_end" → watchdog re-enabled (inAssistantTurn = false) + * SDK "session.task_complete" → JSONL "session.task_complete" (success, summary) + * SDK "subagent.started" → JSONL "subagent.started" (agentName, agentDisplayName, toolCallId) + * SDK "subagent.completed" → JSONL "subagent.completed" (agentName, toolCallId) + * SDK "subagent.failed" → JSONL "subagent.failed" (agentName, toolCallId, error) * * The JSONL file is written to: * /tmp/gh-aw/sandbox/agent/logs/copilot-session-state/{sessionId}/events.jsonl @@ -43,6 +49,15 @@ const SDK_SEND_TIMEOUT_MS_DEFAULT = 10 * 60 * 1000; // Keep in sync with SDK_SESSION_IDLE_TIMEOUT_PATTERN in copilot_harness.cjs. const SDK_IDLE_TIMEOUT_PATTERN = /Timeout after \d+ms waiting for session\.idle/; +// Default idle period for the post-completion watchdog: 5 minutes. +// When the agent has produced output and all tracked tool calls have completed, +// the driver arms a watchdog timer. If no new SDK events arrive within this +// window, the driver force-disconnects the session and treats it as a successful +// completion — covering the SDK driver bug where sendAndWait never resolves after +// the final tool result is returned. +// Override via the GH_AW_SDK_IDLE_MS environment variable. +const SDK_POST_COMPLETION_IDLE_MS_DEFAULT = 5 * 60 * 1000; + /** * Extract the prompt text from a resolved args array. * Looks for the first occurrence of "-p " or "--prompt ". @@ -156,6 +171,23 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c */ const pendingToolCalls = new Map(); + // Post-completion idle watchdog. + // When the agent has produced output and all tracked tool calls have completed, + // this timer is armed. If no new SDK events arrive within GH_AW_SDK_IDLE_MS + // (default 5 minutes), the watchdog force-disconnects the session and the catch + // block treats the result as a successful completion. This bounds the damage + // from the SDK driver bug where sendAndWait never resolves after the final + // tool result is returned. + const postCompletionIdleMs = getEnvPositiveIntOrDefault("GH_AW_SDK_IDLE_MS", SDK_POST_COMPLETION_IDLE_MS_DEFAULT); + let postCompletionWatchdogTriggered = false; + /** @type {ReturnType | null} */ + let postCompletionWatchdog = null; + // Tracks whether the LLM is mid-inference (between assistant.turn_start and + // assistant.turn_end). The watchdog must not fire during this window because + // pendingToolCalls may legitimately be empty before the model dispatches its + // first tool call of the new turn. + let inAssistantTurn = false; + /** * Best-effort write of a driver-level event to events.jsonl and stderr. * @param {string} type @@ -293,10 +325,87 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c break; } + case "assistant.turn_start": + // LLM inference started for a new turn. Disarm the watchdog — the model + // may not have dispatched any tool calls yet, so pendingToolCalls can be + // empty while real work is still in progress. + inAssistantTurn = true; + if (postCompletionWatchdog) { + clearTimeout(postCompletionWatchdog); + postCompletionWatchdog = null; + } + break; + + case "assistant.turn_end": + // LLM inference finished. Allow the watchdog to re-arm on the next event + // that satisfies the completion conditions. + inAssistantTurn = false; + break; + + case "session.task_complete": + writeEvent("session.task_complete", { success: event.data?.success, summary: event.data?.summary }, event.timestamp); + break; + + case "subagent.started": + writeEvent( + "subagent.started", + { + agentName: event.data?.agentName, + agentDisplayName: event.data?.agentDisplayName, + toolCallId: event.data?.toolCallId, + }, + event.timestamp + ); + break; + + case "subagent.completed": + writeEvent("subagent.completed", { agentName: event.data?.agentName, toolCallId: event.data?.toolCallId }, event.timestamp); + break; + + case "subagent.failed": + writeEvent( + "subagent.failed", + { + agentName: event.data?.agentName, + toolCallId: event.data?.toolCallId, + error: event.data?.error, + }, + event.timestamp + ); + break; + default: // Other event types are not consumed by unified_timeline.cjs; skip them. break; } + + // After processing each event, update the post-completion watchdog: + // - Arm (or rearm) the watchdog when the session looks complete: output + // collected, no tool calls still in flight, and not mid-LLM-inference. + // - Disarm the watchdog whenever the session is still mid-turn (a new + // tool call was just started, LLM inference is in progress, or no output yet). + // The watchdog fires only if sendAndWait never resolves on its own after + // the final tool result is returned — the common SDK post-completion hang. + if (hasOutput && pendingToolCalls.size === 0 && !inAssistantTurn) { + if (postCompletionWatchdog) clearTimeout(postCompletionWatchdog); + postCompletionWatchdog = setTimeout(() => { + postCompletionWatchdog = null; + // Re-check conditions at fire time: a new tool call could have started + // or a new turn could have begun between arming the watchdog and the + // timer firing (race condition guard). + if (!hasOutput || pendingToolCalls.size !== 0 || inAssistantTurn || !session) return; + log(`warning: post-completion idle watchdog fired after ${postCompletionIdleMs}ms — force-disconnecting session`); + postCompletionWatchdogTriggered = true; + void session.disconnect().catch(err => { + log(`warning: post-completion watchdog disconnect failed: ${err instanceof Error ? err.message : String(err)}`); + }); + }, postCompletionIdleMs); + } else { + if (postCompletionWatchdog) { + clearTimeout(postCompletionWatchdog); + postCompletionWatchdog = null; + } + } }); log("sending prompt..."); @@ -326,6 +435,15 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c const failure = catastrophicToolDenialsError ?? (err instanceof Error ? err : new Error(String(err))); log(`error: ${failure.message}`); + // When the post-completion idle watchdog force-disconnected the session, the + // agent's work is done — the SDK simply failed to resolve sendAndWait after + // the final tool result was returned. Treat it as a successful completion. + if (postCompletionWatchdogTriggered && !catastrophicToolDenialsError && hasOutput && pendingToolCalls.size === 0) { + log(`warning: post-completion watchdog triggered disconnect — treating as completed`); + log(`session completed: hasOutput=${hasOutput} durationMs=${durationMs}`); + return { exitCode: 0, output, hasOutput, durationMs }; + } + // When sendAndWait times out waiting for session.idle but the agent produced // output and all tracked tool calls have already completed, the session work is // done — the SDK simply failed to emit the idle signal. Treat it as a successful @@ -346,6 +464,11 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c durationMs, }; } finally { + // Clear the post-completion watchdog if it has not already fired. + if (postCompletionWatchdog) { + clearTimeout(postCompletionWatchdog); + postCompletionWatchdog = null; + } // Snapshot for null-safe cleanup in this scope. const stream = eventsStream; if (stream) { @@ -368,4 +491,4 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c } } -module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, SDK_IDLE_TIMEOUT_PATTERN, extractPromptFromArgs, runWithCopilotSDK }; +module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, SDK_POST_COMPLETION_IDLE_MS_DEFAULT, SDK_IDLE_TIMEOUT_PATTERN, extractPromptFromArgs, runWithCopilotSDK };