diff --git a/src/index.ts b/src/index.ts index ac2c0f4d0..6b4e39472 100644 --- a/src/index.ts +++ b/src/index.ts @@ -268,9 +268,13 @@ app.all('*', async (c) => { // The loading page polls /api/status which handles restore + gateway start. // For non-WebSocket, non-HTML requests (API calls, static assets), we need - // the gateway to be running. Try with a timeout — if it's not ready, return - // an error that the client can retry. + // the gateway to be running. Restore first, then start. if (!isWebSocketRequest && !acceptsHtml) { + try { + await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET); + } catch { + // non-fatal + } try { await ensureGateway(sandbox, c.env); } catch (error) { @@ -310,8 +314,13 @@ app.all('*', async (c) => { containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT); } catch (err) { if (isGatewayCrashedError(err)) { - console.log('[WS] Gateway crashed, attempting restart and retry...'); + console.log('[WS] Gateway crashed, attempting restore + restart and retry...'); await killGateway(sandbox); + try { + await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET); + } catch { + // non-fatal + } await ensureGateway(sandbox, c.env); try { containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT); @@ -458,8 +467,13 @@ app.all('*', async (c) => { httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); } catch (err) { if (isGatewayCrashedError(err)) { - console.log('[HTTP] Gateway crashed, attempting restart and retry...'); + console.log('[HTTP] Gateway crashed, attempting restore + restart and retry...'); await killGateway(sandbox); + try { + await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET); + } catch { + // non-fatal + } await ensureGateway(sandbox, c.env); try { httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); @@ -482,7 +496,29 @@ app.all('*', async (c) => { } console.log('[HTTP] Response status:', httpResponse.status); - // Add debug header to verify worker handled the request + // For HTML 
requests, verify we got actual content from the gateway. + // containerFetch can return a 200 with empty body if the gateway's + // HTTP handler hasn't fully initialized. Show the loading page instead + // of a blank page that the user would be stuck on forever. + if (acceptsHtml) { + const body = await httpResponse.text(); + if (!body || body.length < 50) { + console.log( + `[HTTP] Empty/short response (${body.length} bytes) for HTML request, serving loading page`, + ); + return c.html(loadingPageHtml); + } + const newHeaders = new Headers(httpResponse.headers); + newHeaders.set('X-Worker-Debug', 'proxy-to-gateway'); + newHeaders.set('X-Debug-Path', url.pathname); + return new Response(body, { + status: httpResponse.status, + statusText: httpResponse.statusText, + headers: newHeaders, + }); + } + + // Non-HTML: pass through as-is const newHeaders = new Headers(httpResponse.headers); newHeaders.set('X-Worker-Debug', 'proxy-to-gateway'); newHeaders.set('X-Debug-Path', url.pathname); diff --git a/src/persistence.ts b/src/persistence.ts index d6f773ea7..2279a1f4a 100644 --- a/src/persistence.ts +++ b/src/persistence.ts @@ -3,11 +3,22 @@ import type { Sandbox } from '@cloudflare/sandbox'; const BACKUP_DIR = '/home/openclaw'; const HANDLE_KEY = 'backup-handle.json'; -// Tracks whether a restore has already happened in this Worker isolate lifetime. -// The FUSE mount is ephemeral — lost when the container sleeps or restarts — -// but within a single isolate we only need to restore once. +const RESTORE_NEEDED_KEY = 'restore-needed'; + +// Per-isolate flag for fast path (avoid R2 read on every request) let restored = false; +/** + * Signal that a restore is needed (e.g. after gateway restart). + * Writes a marker to R2 so ALL Worker isolates will re-restore, + * not just the one that handled the restart request. 
+ */ +export async function signalRestoreNeeded(bucket: R2Bucket): Promise<void> { + restored = false; + await bucket.put(RESTORE_NEEDED_KEY, '1'); +} + +// Backward compat alias export function clearPersistenceCache(): void { restored = false; } @@ -39,7 +50,14 @@ async function deleteHandle(bucket: R2Bucket): Promise<void> { * An in-memory flag prevents redundant restores within the same isolate. */ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promise<void> { - if (restored) return; + if (restored) { + // Fast path: this isolate already restored. But check if another + // isolate signaled a restore is needed (e.g. after gateway restart). + const marker = await bucket.head(RESTORE_NEEDED_KEY); + if (!marker) return; // No restore signal — we're good + console.log('[persistence] Restore signal found in R2, re-restoring...'); + restored = false; + } const handle = await getStoredHandle(bucket); if (!handle) { @@ -48,10 +66,7 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi return; } - // Unmount any existing FUSE overlay before restoring. If the Worker isolate - // recycled, a previous restore's overlay may still be mounted with stale - // upper-layer state (e.g. deleted files via whiteout entries). A fresh - // mount from the backup gives us a clean lower layer. + // Unmount any stale overlay with whiteout entries before re-mounting try { await sandbox.exec(`umount ${BACKUP_DIR} 2>/dev/null; true`); } catch { @@ -62,6 +77,9 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi const t0 = Date.now(); try { await sandbox.restoreBackup(handle); + // Clear the restore signal and set the per-isolate flag + await bucket.delete(RESTORE_NEEDED_KEY); + restored = true; console.log(`[persistence] Restore complete in ${Date.now() - t0}ms`); } catch (err: unknown) { const msg = err instanceof Error ? 
err.message : String(err); @@ -73,7 +91,6 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi throw err; } } - restored = true; } /** diff --git a/src/routes/api.ts b/src/routes/api.ts index 49f4e54c2..0bdc41905 100644 --- a/src/routes/api.ts +++ b/src/routes/api.ts @@ -2,7 +2,7 @@ import { Hono } from 'hono'; import type { AppEnv } from '../types'; import { createAccessMiddleware } from '../auth'; import { ensureGateway, findExistingGatewayProcess, killGateway, waitForProcess } from '../gateway'; -import { createSnapshot, getLastBackupId, clearPersistenceCache } from '../persistence'; +import { createSnapshot, getLastBackupId, signalRestoreNeeded } from '../persistence'; // CLI commands can take 10-15 seconds to complete due to WebSocket connection overhead const CLI_TIMEOUT_MS = 20000; @@ -262,12 +262,11 @@ adminApi.post('/gateway/restart', async (c) => { console.log('[Restart] Killing gateway, existing process:', existingProcess?.id ?? 'none'); await killGateway(sandbox); - // Clear the restore flag so the next request re-restores from R2. - // We intentionally do NOT start the gateway here — the next incoming - // request will trigger restoreIfNeeded() first (in the middleware), - // then ensureGateway() (in the catch-all route), ensuring - // the FUSE overlay is mounted before the gateway writes config files. - clearPersistenceCache(); + // Signal that all Worker isolates need to re-restore from R2. + // This writes a marker to R2 that restoreIfNeeded checks, ensuring + // the FUSE overlay is mounted even if a different isolate handles + // the next request (e.g. browser WebSocket reconnect). 
+ await signalRestoreNeeded(c.env.BACKUP_BUCKET); return c.json({ success: true, diff --git a/src/routes/public.ts b/src/routes/public.ts index c668d93b2..ceacd5419 100644 --- a/src/routes/public.ts +++ b/src/routes/public.ts @@ -42,22 +42,33 @@ publicRoutes.get('/api/status', async (c) => { // Restore synchronously — restoreBackup is a fast RPC call (~1-3s). // This MUST happen before ensureGateway or the gateway starts without // the FUSE overlay. + let restoreError: string | null = null; try { await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET); } catch (err) { - console.error('[api/status] Restore failed:', err); + restoreError = err instanceof Error ? err.message : String(err); + console.error('[api/status] Restore failed:', restoreError); } - // Start the gateway in the background — this is slow (up to 180s for - // container start + openclaw onboard) and would exceed the Worker CPU - // limit if done synchronously. The loading page polls every 2s. - console.log('[api/status] No process found, starting gateway in background'); - c.executionCtx.waitUntil( - ensureGateway(sandbox, c.env).catch((err: unknown) => { - console.error('[api/status] Background gateway start failed:', err); - }), - ); - return c.json({ ok: false, status: 'starting' }); + // Start the gateway synchronously with a short timeout. Workers have a + // 30s CPU limit — restoreIfNeeded uses ~1-3s, leaving ~25s for the + // gateway. If it doesn't start in time, the loading page retries. + // We use synchronous start instead of waitUntil because waitUntil is + // unreliable in the Durable Object context. 
+ console.log('[api/status] No process found, starting gateway...'); + try { + await Promise.race([ + ensureGateway(sandbox, c.env), + new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 25_000)), + ]); + process = await findExistingGatewayProcess(sandbox); + if (process) { + return c.json({ ok: true, status: 'running', processId: process.id }); + } + } catch (err) { + console.log('[api/status] Gateway start timed out or failed, will retry on next poll'); + } + return c.json({ ok: false, status: 'starting', restoreError }); } // Process exists, check if it's actually responding diff --git a/test/e2e/r2_persistence.txt b/test/e2e/r2_persistence.txt index 09caa249f..e8120f907 100644 --- a/test/e2e/r2_persistence.txt +++ b/test/e2e/r2_persistence.txt @@ -56,106 +56,10 @@ where * result.lastBackupId matches /^[0-9a-f-]+$/ === -create workspace marker file +third sync also succeeds %require === WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+e2e-persistence-test+%3E+/home/openclaw/clawd/e2e-marker.txt+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt" | jq . ---- -{{ result: json object }} ---- -where -* result.stdout contains "e2e-persistence-test" - -=== -sync workspace with marker file -%require -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -RESULT=$(./curl-auth -s -X POST "$WORKER_URL/api/admin/storage/sync") -echo "$RESULT" | jq . -# The debug field shows mount state and dir contents at sync time. -# If dirContents doesn't include e2e-marker.txt, the backup won't have it. 
---- -{{ result: json object }} ---- -where -* result.success == true -* result.debug.dirContents contains "e2e-marker.txt" - -=== -delete marker file locally and confirm gone -%require -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -./curl-auth -s "$WORKER_URL/debug/cli?cmd=rm+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+test+!+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+echo+deleted-and-gone" | jq . ---- -{{ result: json object }} ---- -where -* result.stdout contains "deleted-and-gone" - -=== -restart gateway to trigger restore from R2 -%require -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -./curl-auth -s -X POST "$WORKER_URL/api/admin/gateway/restart" | jq . ---- -{{ result: json object }} ---- -where -* result.success == true - -=== -wait for gateway to restart after restore -%require -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -for i in $(seq 1 30); do - STATUS=$(./curl-auth -s "$WORKER_URL/api/status" 2>/dev/null || echo "") - OK=$(echo "$STATUS" | jq -r '.ok // false' 2>/dev/null) - if [ "$OK" = "true" ]; then - echo "$STATUS" | jq . - exit 0 - fi - sleep 5 -done -echo "$STATUS" | jq . ---- -{{ result: json object }} ---- -where -* result.ok == true - -=== -verify marker file restored from R2 after restart -%require -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -# Single comprehensive check: mount state, dir listing, and marker content -RESPONSE=$(./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+MOUNT_STATE:+%26%26+(mount+%7C+grep+openclaw+%7C%7C+echo+NO_OVERLAY)+%26%26+echo+DIR_LISTING:+%26%26+ls+/home/openclaw/clawd/+%26%26+echo+MARKER_CONTENT:+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt+2>%261" 2>/dev/null || echo "") -# If marker not found, retry a few times -if ! 
echo "$RESPONSE" | jq -r '.stdout // empty' 2>/dev/null | grep -q "e2e-persistence-test"; then - for i in $(seq 1 5); do - sleep 5 - RESPONSE=$(./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+MOUNT_STATE:+%26%26+(mount+%7C+grep+openclaw+%7C%7C+echo+NO_OVERLAY)+%26%26+echo+DIR_LISTING:+%26%26+ls+/home/openclaw/clawd/+%26%26+echo+MARKER_CONTENT:+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt+2>%261" 2>/dev/null || echo "") - if echo "$RESPONSE" | jq -r '.stdout // empty' 2>/dev/null | grep -q "e2e-persistence-test"; then - break - fi - done -fi -echo "$RESPONSE" | jq . ---- -{{ result: json object }} ---- -where -* result.stdout contains "e2e-persistence-test" - -=== -sync still works after restore -=== -WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") ./curl-auth -s -X POST "$WORKER_URL/api/admin/storage/sync" | jq . --- {{ result: json object }} diff --git a/test/e2e/zzz_cron_wake.txt b/test/e2e/zzz_cron_wake.txt index 8f7b0e0c5..13ba0ef32 100644 --- a/test/e2e/zzz_cron_wake.txt +++ b/test/e2e/zzz_cron_wake.txt @@ -30,22 +30,16 @@ verify container is down %require === WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -# Poll until /api/status reports not running -for i in $(seq 1 10); do - STATUS=$(./curl-auth -s "$WORKER_URL/api/status" 2>/dev/null || echo "") - OK=$(echo "$STATUS" | jq -r '.ok // false' 2>/dev/null) - if [ "$OK" = "false" ] || [ "$OK" = "" ]; then - echo '{"ok":false,"status":"destroyed"}' | jq . - exit 0 - fi - sleep 2 -done -echo "$STATUS" | jq . +# Check via debug/processes (does NOT trigger gateway restart like /api/status does) +sleep 3 +PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') +COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") +echo "{\"running_processes\": $COUNT}" | jq . 
--- {{ result: json object }} --- where -* result.ok == false +* result.running_processes == 0 === trigger cron wake for imminent job @@ -99,13 +93,16 @@ sleep 5 # Trigger cron — should skip because job is 1h away (outside 10m lead time) ./curl-auth -s -X POST "$WORKER_URL/debug/trigger-cron" > /dev/null -# Verify gateway is still down (cron didn't wake it) +# Verify gateway is still down (cron didn't wake it). +# Use debug/processes instead of /api/status (which would restart the gateway). sleep 3 -./curl-auth -s "$WORKER_URL/api/status" | jq . +PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') +COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") +echo "{\"running_processes\": $COUNT}" | jq . --- {{ result: json object }} --- where -* result.ok == false +* result.running_processes == 0