Skip to content

Commit 7d63525

Browse files
nficanoclaude
andcommitted
fix(runtime): emit LEASE_EXPIRED terminal on lease watchdog expiry (§9.5)
The lease watchdog used to emit a synthetic `tool_result.error` with a fake `call_id` (`lease_{job_id}`) — that doesn't correspond to any prior `tool_call`, so any subscriber correlating tool_call→tool_result saw a dangling reference. Worse, after that it cancelled the job, which routed the run-loop's `OperationCanceledException` catch to `job.error{code:CANCELLED, final_status:"cancelled"}` rather than the spec-required `LEASE_EXPIRED` / `final_status:"error"`. - Add `Job.MarkLeaseExpired()` / `Job.LeaseExpired` analogous to the existing `MarkRuntimeLimitExceeded`. - Have `RunLeaseWatchdog` emit a `status{phase:"lease_expired"}` event (well-known phase added to `StatusPhases`), set the flag, then cancel. - Branch the run-loop's `OperationCanceledException` catch on the flag and emit `job.error{LEASE_EXPIRED, final_status:"error", retryable:false}`. - Update the LeaseExpiresAt sample header comment to match what the runtime now delivers. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 858b3c5 commit 7d63525

7 files changed

Lines changed: 68 additions & 10 deletions

File tree

samples/LeaseExpiresAt/Program.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// samples/LeaseExpiresAt: a short-lived lease expires while the agent is running; the runtime
3-
// emits tool_result.error{LEASE_EXPIRED} and then job.error{LEASE_EXPIRED}. Spec §9.5, §13.4.
3+
// emits a status{phase:"lease_expired"} event and then job.error{LEASE_EXPIRED,
4+
// final_status:"error"}. Spec §9.5.
45
using Arcp.Client;
56
using Arcp.Core.Leases;
67
using Arcp.Core.Messages;

src/Arcp.Core/Messages/StatusPhases.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,8 @@ public static class StatusPhases
77
{
88
/// <summary>Gets the credential rotated.</summary>
99
public const string CredentialRotated = "credential_rotated";
10+
11+
/// <summary>Emitted by the runtime when a job's lease has expired (spec §9.5). The terminal
12+
/// <c>job.error</c> with code <c>LEASE_EXPIRED</c> and <c>final_status:"error"</c> follows.</summary>
13+
public const string LeaseExpired = "lease_expired";
1014
}

src/Arcp.Core/PublicAPI.Unshipped.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,7 @@ const Arcp.Core.Messages.MessageTypeNames.SessionPong = "session.pong" -> string
719719
const Arcp.Core.Messages.MessageTypeNames.SessionResume = "session.resume" -> string!
720720
const Arcp.Core.Messages.MessageTypeNames.SessionWelcome = "session.welcome" -> string!
721721
const Arcp.Core.Messages.StatusPhases.CredentialRotated = "credential_rotated" -> string!
722+
const Arcp.Core.Messages.StatusPhases.LeaseExpired = "lease_expired" -> string!
722723
const Arcp.Core.Tracing.ArcpDiagnostics.RuntimeSourceName = "Arcp.Runtime" -> string!
723724
const Arcp.Core.Tracing.ArcpDiagnostics.TransportSourceName = "Arcp.Transport" -> string!
724725
const Arcp.Core.Tracing.TraceAttributes.Agent = "arcp.agent" -> string!

src/Arcp.Runtime/Job.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,13 @@ public sealed class Job
9191

9292
internal void MarkRuntimeLimitExceeded() => RuntimeLimitExceeded = true;
9393

94+
/// <summary>Set by the lease watchdog when <c>lease_constraints.expires_at</c> elapses so the
95+
/// run-loop can surface <c>LEASE_EXPIRED</c> / <c>final_status:"error"</c> instead of
96+
/// <c>CANCELLED</c> (spec §9.5).</summary>
97+
public bool LeaseExpired { get; private set; }
98+
99+
internal void MarkLeaseExpired() => LeaseExpired = true;
100+
94101
/// <summary>Gets the credentials.</summary>
95102
public IReadOnlyList<ProvisionedCredential> Credentials
96103
{

src/Arcp.Runtime/JobManager.cs

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,17 @@ public async Task RunAsync(Job job, IAgent agent, Func<Envelope, CancellationTok
207207
await EmitTimeoutAsync(job, emit, cancellationToken).ConfigureAwait(false);
208208
job.MarkTerminal(JobStatus.TimedOut);
209209
}
210+
else if (job.LeaseExpired)
211+
{
212+
await EmitJobErrorAsync(job, emit, new JobErrorPayload
213+
{
214+
FinalStatus = "error",
215+
Code = ErrorCode.LeaseExpired,
216+
Message = "Lease expired",
217+
Retryable = false,
218+
}, cancellationToken).ConfigureAwait(false);
219+
job.MarkTerminal(JobStatus.Error);
220+
}
210221
else
211222
{
212223
await EmitJobErrorAsync(job, emit, new JobErrorPayload
@@ -358,17 +369,15 @@ private async Task RunLeaseWatchdog(Job job, DateTimeOffset expiresAt, Func<Enve
358369
if (cancellationToken.IsCancellationRequested || IsTerminal(job.Status)) return;
359370
if (!job.CancellationToken.IsCancellationRequested)
360371
{
361-
// Emit a tool_result.error then job.error per spec §13.4.
362-
await job.EmitEventAsync(EventKinds.ToolResult, new ToolResultBody
372+
// Surface lease expiry as a status event (spec §9.5); the terminal
373+
// job.error{LEASE_EXPIRED, final_status:"error"} is emitted by the run-loop
374+
// once cancellation unwinds the agent.
375+
await job.EmitEventAsync(EventKinds.Status, new StatusBody
363376
{
364-
CallId = $"lease_{job.JobId.Value}",
365-
Error = new ToolError
366-
{
367-
Code = ErrorCode.LeaseExpired,
368-
Message = $"Lease expired at {expiresAt:O}",
369-
Retryable = false,
370-
},
377+
Phase = StatusPhases.LeaseExpired,
378+
Message = $"Lease expired at {expiresAt:O}",
371379
}, cancellationToken).ConfigureAwait(false);
380+
job.MarkLeaseExpired();
372381
job.CancellationSource.Cancel();
373382
}
374383
}

src/Arcp.Runtime/PublicAPI.Unshipped.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ Arcp.Runtime.Job.Credentials.get -> System.Collections.Generic.IReadOnlyList<Arc
197197
Arcp.Runtime.JobContext.Credentials.get -> System.Collections.Generic.IReadOnlyList<Arcp.Core.Messages.ProvisionedCredential!>!
198198
Arcp.Runtime.JobContext.RotateCredentialAsync(string! credentialId, Arcp.Core.Messages.ProvisionedCredential! next, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.ValueTask
199199
Arcp.Runtime.JobManager.JobManager(Arcp.Runtime.Agents.AgentRegistry! agents, Arcp.Runtime.Leases.LeaseManager! leases, System.TimeProvider! time, Microsoft.Extensions.Logging.ILoggerFactory! loggers, Arcp.Runtime.Credentials.CredentialManager? credentials = null, int idempotencyWindowSec = 3600) -> void
200+
Arcp.Runtime.Job.LeaseExpired.get -> bool
200201
Arcp.Runtime.Job.MaxRuntimeSec.get -> int?
201202
Arcp.Runtime.Job.RuntimeLimitExceeded.get -> bool
202203
Arcp.Runtime.ArcpServerOptions.IdempotencyWindowSec.get -> int

tests/Arcp.IntegrationTests/RuntimeLimitTests.cs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,39 @@ public async Task Job_finishing_before_lease_expiry_does_not_emit_late_lease_exp
7373
sw.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(3),
7474
because: "the watchdog must not keep the run task alive until lease expiry");
7575
}
76+
77+
[Fact]
78+
public async Task Lease_expiry_terminates_job_with_LEASE_EXPIRED_error_status()
79+
{
80+
// Spec §9.5: lease expiry yields `job.error{code: LEASE_EXPIRED, final_status: "error"}`,
81+
// not `cancelled` — a separate terminal from the `job.cancel`-driven CANCELLED path.
82+
var server = new ArcpServer(new ArcpServerOptions
83+
{
84+
Runtime = new RuntimeInfo { Name = "test-runtime", Version = "1.0.0" },
85+
});
86+
server.RegisterAgent("worker", async (ctx, ct) =>
87+
{
88+
try { await Task.Delay(TimeSpan.FromSeconds(10), ct); }
89+
catch (OperationCanceledException) { throw; }
90+
return null;
91+
});
92+
var (client, srv) = MemoryTransport.Pair();
93+
_ = Task.Run(() => server.AcceptAsync(srv));
94+
95+
await using var c = await ArcpClient.ConnectAsync(client, new ArcpClientOptions
96+
{
97+
Client = new ClientInfo { Name = "test", Version = "1.0" },
98+
});
99+
100+
var handle = await c.SubmitAsync("worker", leaseConstraints: new Arcp.Core.Leases.LeaseConstraints
101+
{
102+
ExpiresAt = DateTimeOffset.UtcNow.AddMilliseconds(500),
103+
});
104+
var result = await handle.Result.WaitAsync(TimeSpan.FromSeconds(5));
105+
106+
result.Success.Should().BeFalse();
107+
result.Error!.Code.Should().Be(Arcp.Core.Errors.ErrorCode.LeaseExpired);
108+
result.Error.FinalStatus.Should().Be("error");
109+
result.Error.Retryable.Should().BeFalse();
110+
}
76111
}

0 commit comments

Comments
 (0)