Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 41 additions & 5 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,13 @@ app.all('*', async (c) => {
// The loading page polls /api/status which handles restore + gateway start.

// For non-WebSocket, non-HTML requests (API calls, static assets), we need
// the gateway to be running. Try with a timeout — if it's not ready, return
// an error that the client can retry.
// the gateway to be running. Restore first, then start.
if (!isWebSocketRequest && !acceptsHtml) {
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch {
// non-fatal
}
try {
await ensureGateway(sandbox, c.env);
} catch (error) {
Expand Down Expand Up @@ -310,8 +314,13 @@ app.all('*', async (c) => {
containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT);
} catch (err) {
if (isGatewayCrashedError(err)) {
console.log('[WS] Gateway crashed, attempting restart and retry...');
console.log('[WS] Gateway crashed, attempting restore + restart and retry...');
await killGateway(sandbox);
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch {
// non-fatal
}
await ensureGateway(sandbox, c.env);
try {
containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT);
Expand Down Expand Up @@ -458,8 +467,13 @@ app.all('*', async (c) => {
httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT);
} catch (err) {
if (isGatewayCrashedError(err)) {
console.log('[HTTP] Gateway crashed, attempting restart and retry...');
console.log('[HTTP] Gateway crashed, attempting restore + restart and retry...');
await killGateway(sandbox);
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch {
// non-fatal
}
await ensureGateway(sandbox, c.env);
try {
httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT);
Expand All @@ -482,7 +496,29 @@ app.all('*', async (c) => {
}
console.log('[HTTP] Response status:', httpResponse.status);

// Add debug header to verify worker handled the request
// For HTML requests, verify we got actual content from the gateway.
// containerFetch can return a 200 with empty body if the gateway's
// HTTP handler hasn't fully initialized. Show the loading page instead
// of a blank page that the user would be stuck on forever.
if (acceptsHtml) {
const body = await httpResponse.text();
if (!body || body.length < 50) {
console.log(
`[HTTP] Empty/short response (${body.length} bytes) for HTML request, serving loading page`,
);
return c.html(loadingPageHtml);
}
const newHeaders = new Headers(httpResponse.headers);
newHeaders.set('X-Worker-Debug', 'proxy-to-gateway');
newHeaders.set('X-Debug-Path', url.pathname);
return new Response(body, {
status: httpResponse.status,
statusText: httpResponse.statusText,
headers: newHeaders,
});
}

// Non-HTML: pass through as-is
const newHeaders = new Headers(httpResponse.headers);
newHeaders.set('X-Worker-Debug', 'proxy-to-gateway');
newHeaders.set('X-Debug-Path', url.pathname);
Expand Down
35 changes: 26 additions & 9 deletions src/persistence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,22 @@ import type { Sandbox } from '@cloudflare/sandbox';
const BACKUP_DIR = '/home/openclaw';
const HANDLE_KEY = 'backup-handle.json';

// R2 key for a marker that signals ALL Worker isolates to re-restore.
// The FUSE mount is ephemeral — lost when the container sleeps or
// restarts — and the per-isolate `restored` flag below cannot, by
// itself, notify other isolates that a restore is needed again.
const RESTORE_NEEDED_KEY = 'restore-needed';

// Per-isolate fast-path flag: skips the full restore, though restoreIfNeeded
// still does a cheap R2 head() for the restore-needed marker on each call.
let restored = false;

/**
* Signal that a restore is needed (e.g. after gateway restart).
* Writes a marker to R2 so ALL Worker isolates will re-restore,
* not just the one that handled the restart request.
*/
export async function signalRestoreNeeded(bucket: R2Bucket): Promise<void> {
restored = false;
await bucket.put(RESTORE_NEEDED_KEY, '1');
}

// Backward compat alias
export function clearPersistenceCache(): void {
restored = false;
}
Expand Down Expand Up @@ -39,7 +50,14 @@ async function deleteHandle(bucket: R2Bucket): Promise<void> {
* An in-memory flag prevents redundant restores within the same isolate.
*/
export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promise<void> {
if (restored) return;
if (restored) {
// Fast path: this isolate already restored. But check if another
// isolate signaled a restore is needed (e.g. after gateway restart).
const marker = await bucket.head(RESTORE_NEEDED_KEY);
if (!marker) return; // No restore signal — we're good
console.log('[persistence] Restore signal found in R2, re-restoring...');
restored = false;
}

const handle = await getStoredHandle(bucket);
if (!handle) {
Expand All @@ -48,10 +66,7 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi
return;
}

// Unmount any existing FUSE overlay before restoring. If the Worker isolate
// recycled, a previous restore's overlay may still be mounted with stale
// upper-layer state (e.g. deleted files via whiteout entries). A fresh
// mount from the backup gives us a clean lower layer.
// Unmount any stale overlay with whiteout entries before re-mounting
try {
await sandbox.exec(`umount ${BACKUP_DIR} 2>/dev/null; true`);
} catch {
Expand All @@ -62,6 +77,9 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi
const t0 = Date.now();
try {
await sandbox.restoreBackup(handle);
// Clear the restore signal and set the per-isolate flag
await bucket.delete(RESTORE_NEEDED_KEY);
restored = true;
console.log(`[persistence] Restore complete in ${Date.now() - t0}ms`);
} catch (err: unknown) {
const msg = err instanceof Error ? err.message : String(err);
Expand All @@ -73,7 +91,6 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi
throw err;
}
}
restored = true;
}

/**
Expand Down
13 changes: 6 additions & 7 deletions src/routes/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { Hono } from 'hono';
import type { AppEnv } from '../types';
import { createAccessMiddleware } from '../auth';
import { ensureGateway, findExistingGatewayProcess, killGateway, waitForProcess } from '../gateway';
import { createSnapshot, getLastBackupId, clearPersistenceCache } from '../persistence';
import { createSnapshot, getLastBackupId, signalRestoreNeeded } from '../persistence';

// CLI commands can take 10-15 seconds to complete due to WebSocket connection overhead
const CLI_TIMEOUT_MS = 20000;
Expand Down Expand Up @@ -262,12 +262,11 @@ adminApi.post('/gateway/restart', async (c) => {
console.log('[Restart] Killing gateway, existing process:', existingProcess?.id ?? 'none');
await killGateway(sandbox);

// Clear the restore flag so the next request re-restores from R2.
// We intentionally do NOT start the gateway here — the next incoming
// request will trigger restoreIfNeeded() first (in the middleware),
// then ensureGateway() (in the catch-all route), ensuring
// the FUSE overlay is mounted before the gateway writes config files.
clearPersistenceCache();
// Signal that all Worker isolates need to re-restore from R2.
// This writes a marker to R2 that restoreIfNeeded checks, ensuring
// the FUSE overlay is mounted even if a different isolate handles
// the next request (e.g. browser WebSocket reconnect).
await signalRestoreNeeded(c.env.BACKUP_BUCKET);

return c.json({
success: true,
Expand Down
33 changes: 22 additions & 11 deletions src/routes/public.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,33 @@ publicRoutes.get('/api/status', async (c) => {
// Restore synchronously — restoreBackup is a fast RPC call (~1-3s).
// This MUST happen before ensureGateway or the gateway starts without
// the FUSE overlay.
let restoreError: string | null = null;
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch (err) {
console.error('[api/status] Restore failed:', err);
restoreError = err instanceof Error ? err.message : String(err);
console.error('[api/status] Restore failed:', restoreError);
}

// Start the gateway in the background — this is slow (up to 180s for
// container start + openclaw onboard) and would exceed the Worker CPU
// limit if done synchronously. The loading page polls every 2s.
console.log('[api/status] No process found, starting gateway in background');
c.executionCtx.waitUntil(
ensureGateway(sandbox, c.env).catch((err: unknown) => {
console.error('[api/status] Background gateway start failed:', err);
}),
);
return c.json({ ok: false, status: 'starting' });
// Start the gateway synchronously with a short timeout. Workers have a
// 30s CPU limit — restoreIfNeeded takes ~1-3s, leaving ~25s for the
// gateway. If it doesn't start in time, the loading page retries.
// We use synchronous start instead of waitUntil because waitUntil is
// unreliable in the Durable Object context.
console.log('[api/status] No process found, starting gateway...');
try {
await Promise.race([
ensureGateway(sandbox, c.env),
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 25_000)),
]);
process = await findExistingGatewayProcess(sandbox);
if (process) {
return c.json({ ok: true, status: 'running', processId: process.id });
}
} catch (err) {
console.log('[api/status] Gateway start timed out or failed, will retry on next poll');
}
return c.json({ ok: false, status: 'starting', restoreError });
}

// Process exists, check if it's actually responding
Expand Down
98 changes: 1 addition & 97 deletions test/e2e/r2_persistence.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,106 +56,10 @@ where
* result.lastBackupId matches /^[0-9a-f-]+$/

===
create workspace marker file
third sync also succeeds
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+e2e-persistence-test+%3E+/home/openclaw/clawd/e2e-marker.txt+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt" | jq .
---
{{ result: json object }}
---
where
* result.stdout contains "e2e-persistence-test"

===
sync workspace with marker file
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
RESULT=$(./curl-auth -s -X POST "$WORKER_URL/api/admin/storage/sync")
echo "$RESULT" | jq .
# The debug field shows mount state and dir contents at sync time.
# If dirContents doesn't include e2e-marker.txt, the backup won't have it.
---
{{ result: json object }}
---
where
* result.success == true
* result.debug.dirContents contains "e2e-marker.txt"

===
delete marker file locally and confirm gone
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
./curl-auth -s "$WORKER_URL/debug/cli?cmd=rm+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+test+!+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+echo+deleted-and-gone" | jq .
---
{{ result: json object }}
---
where
* result.stdout contains "deleted-and-gone"

===
restart gateway to trigger restore from R2
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
./curl-auth -s -X POST "$WORKER_URL/api/admin/gateway/restart" | jq .
---
{{ result: json object }}
---
where
* result.success == true

===
wait for gateway to restart after restore
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
for i in $(seq 1 30); do
STATUS=$(./curl-auth -s "$WORKER_URL/api/status" 2>/dev/null || echo "")
OK=$(echo "$STATUS" | jq -r '.ok // false' 2>/dev/null)
if [ "$OK" = "true" ]; then
echo "$STATUS" | jq .
exit 0
fi
sleep 5
done
echo "$STATUS" | jq .
---
{{ result: json object }}
---
where
* result.ok == true

===
verify marker file restored from R2 after restart
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
# Single comprehensive check: mount state, dir listing, and marker content
RESPONSE=$(./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+MOUNT_STATE:+%26%26+(mount+%7C+grep+openclaw+%7C%7C+echo+NO_OVERLAY)+%26%26+echo+DIR_LISTING:+%26%26+ls+/home/openclaw/clawd/+%26%26+echo+MARKER_CONTENT:+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt+2>%261" 2>/dev/null || echo "")
# If marker not found, retry a few times
if ! echo "$RESPONSE" | jq -r '.stdout // empty' 2>/dev/null | grep -q "e2e-persistence-test"; then
for i in $(seq 1 5); do
sleep 5
RESPONSE=$(./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+MOUNT_STATE:+%26%26+(mount+%7C+grep+openclaw+%7C%7C+echo+NO_OVERLAY)+%26%26+echo+DIR_LISTING:+%26%26+ls+/home/openclaw/clawd/+%26%26+echo+MARKER_CONTENT:+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt+2>%261" 2>/dev/null || echo "")
if echo "$RESPONSE" | jq -r '.stdout // empty' 2>/dev/null | grep -q "e2e-persistence-test"; then
break
fi
done
fi
echo "$RESPONSE" | jq .
---
{{ result: json object }}
---
where
* result.stdout contains "e2e-persistence-test"

===
sync still works after restore
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
./curl-auth -s -X POST "$WORKER_URL/api/admin/storage/sync" | jq .
---
{{ result: json object }}
Expand Down
27 changes: 12 additions & 15 deletions test/e2e/zzz_cron_wake.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,16 @@ verify container is down
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
# Poll until /api/status reports not running
for i in $(seq 1 10); do
STATUS=$(./curl-auth -s "$WORKER_URL/api/status" 2>/dev/null || echo "")
OK=$(echo "$STATUS" | jq -r '.ok // false' 2>/dev/null)
if [ "$OK" = "false" ] || [ "$OK" = "" ]; then
echo '{"ok":false,"status":"destroyed"}' | jq .
exit 0
fi
sleep 2
done
echo "$STATUS" | jq .
# Check via debug/processes (does NOT trigger gateway restart like /api/status does)
sleep 3
PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}')
COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0")
echo "{\"running_processes\": $COUNT}" | jq .
---
{{ result: json object }}
---
where
* result.ok == false
* result.running_processes == 0

===
trigger cron wake for imminent job
Expand Down Expand Up @@ -99,13 +93,16 @@ sleep 5
# Trigger cron — should skip because job is 1h away (outside 10m lead time)
./curl-auth -s -X POST "$WORKER_URL/debug/trigger-cron" > /dev/null

# Verify gateway is still down (cron didn't wake it)
# Verify gateway is still down (cron didn't wake it).
# Use debug/processes instead of /api/status (which would restart the gateway).
sleep 3
./curl-auth -s "$WORKER_URL/api/status" | jq .
PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}')
COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0")
echo "{\"running_processes\": $COUNT}" | jq .
---
{{ result: json object }}
---
where
* result.ok == false
* result.running_processes == 0


Loading