Skip to content

Commit 6283630

Browse files
author
IM.codes
committed
Speed up stale transport busy recovery
1 parent 497f86b commit 6283630

2 files changed

Lines changed: 35 additions & 2 deletions

File tree

src/agent/transport-session-runtime.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ const MIN_TRANSPORT_STALE_PENDING_RECOVERY_MS = 10_000;
124124
const RECOVERABLE_DISPATCH_RETRY_BASE_MS = 1_000;
125125
const RECOVERABLE_DISPATCH_RETRY_MAX_MS = 8_000;
126126
const MAX_RECOVERABLE_DISPATCH_RETRIES = 15;
127+
// "Provider/session is already busy" while this runtime has no active accepted
128+
// provider turn is almost always a stale SDK-side busy marker. Do not burn the
129+
// full generic retry budget (≈2 minutes): preserve the turn and let
130+
// session-manager relaunch the provider after a few retries confirm it is still busy.
131+
const MAX_RECOVERABLE_BUSY_DISPATCH_RETRIES = 3;
127132
const MAX_TRANSPORT_STALE_PENDING_RECOVERY_MS = 30 * 60_000;
128133
const MIN_TRANSPORT_STALE_PENDING_CANCEL_FALLBACK_MS = 50;
129134
const MAX_TRANSPORT_STALE_PENDING_CANCEL_FALLBACK_MS = 60_000;
@@ -1526,8 +1531,10 @@ export class TransportSessionRuntime implements SessionRuntime {
15261531
// drop the message. Re-queue and auto-retry with backoff so the work
15271532
// completes when the provider frees up; only give up (error) once the
15281533
// bounded retry budget is exhausted (a genuinely wedged provider). Recoverable provider-busy errors use the shorter MAX_RECOVERABLE_BUSY_DISPATCH_RETRIES budget.
1529-
if (providerError.code !== PROVIDER_ERROR_CODES.CANCELLED
1530-
&& this.requeueAndScheduleRecoverableRetry(providerError)) {
1534+
const isRecoverableBusy = isRecoverableProviderBusyError(providerError);
1535+
const canRetryRecoverable = providerError.code !== PROVIDER_ERROR_CODES.CANCELLED
1536+
&& (!isRecoverableBusy || this._recoverableDispatchRetries < MAX_RECOVERABLE_BUSY_DISPATCH_RETRIES);
1537+
if (canRetryRecoverable && this.requeueAndScheduleRecoverableRetry(providerError)) {
15311538
return;
15321539
}
15331540
this._recoverableDispatchRetries = 0;

test/daemon/transport-session-runtime.test.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,32 @@ describe('TransportSessionRuntime', () => {
883883
});
884884
});
885885

886+
it('uses a short retry budget for stale provider busy instead of the generic recoverable budget', async () => {
887+
const send = mock.provider.send as ReturnType<typeof vi.fn>;
888+
send.mockRejectedValue({
889+
code: PROVIDER_ERROR_CODES.PROVIDER_ERROR,
890+
message: 'Codex SDK session is already busy',
891+
recoverable: true,
892+
});
893+
894+
expect(runtime.send('stale busy should relaunch quickly', 'msg-short-busy')).toBe('sent');
895+
for (let expectedCalls = 1; expectedCalls <= 4; expectedCalls++) {
896+
if (expectedCalls > 1) {
897+
(runtime as unknown as { _lastProviderOutputAt: number })._lastProviderOutputAt = Date.now() - 10_000;
898+
await waitForProviderSendCount(mock.provider, expectedCalls);
899+
} else {
900+
await flushDispatch();
901+
}
902+
}
903+
await flushDispatch();
904+
905+
expect(send).toHaveBeenCalledTimes(4);
906+
expect(runtime.getStatus()).toBe('error');
907+
expect(runtime.pendingEntries).toEqual([
908+
{ clientMessageId: 'msg-short-busy', text: 'stale busy should relaunch quickly' },
909+
]);
910+
});
911+
886912
it('STOP during an auto-retry interrupts only the retried turn and keeps later-queued messages', async () => {
887913
// Regression (audit Medium): STOP must NOT clear the whole queue during an
888914
// auto-retry. Only the turn being retried is dropped; messages the user

0 commit comments

Comments
 (0)