|
| 1 | +/** |
| 2 | + * E2E integration test for the daemon auth handshake race. |
| 3 | + * |
| 4 | + * Production observation (78 server, 2026-05-11): a single daemon was |
| 5 | + * authenticating ~5 times per 10 seconds, and the daemon side reported |
| 6 | + * `code:4001 reason:auth_required` on every cycle. The user-visible |
| 7 | + * symptom was "server 重启 → daemon reconnect 极慢" plus a permanent |
| 8 | + * "DAEMON 失联" banner that survived all earlier client/UI fixes. |
| 9 | + * |
| 10 | + * Root cause was a race in `WsBridge.handleDaemonConnection`'s async |
| 11 | + * message handler. The daemon sends `auth` immediately followed by |
| 12 | + * `daemon.hello` on every WS open. Both messages reach the server |
| 13 | + * before the auth handler's `await db.queryOne(...)` settles. While |
| 14 | + * the auth flow is parked at the DB await, `this.authenticated` is |
| 15 | + * still `false`, so the `daemon.hello` handler hits |
| 16 | + * `ws.close(4001, 'auth_required')` and kills the freshly-opened |
| 17 | + * connection. |
| 18 | + * |
| 19 | + * The mocked unit test in `bridge.test.ts` covers the deferred-DB |
| 20 | + * scenario, but mocks cannot guarantee the same message-ordering |
| 21 | + * semantics the real `ws` server stack exhibits. This file spins up |
| 22 | + * an in-process `http.Server` + `WebSocketServer` and connects real |
| 23 | + * `ws` clients so the race window is exercised end-to-end. |
| 24 | + * |
| 25 | + * Stability guarantees: |
| 26 | + * 1. Single back-to-back `auth + daemon.hello` flow: connection |
| 27 | + * stays open, auth completes, hello is processed. |
| 28 | + * 2. Burst-reconnect resilience: 10 sequential reconnect cycles |
| 29 | + * complete without a single 4001 close — simulating the |
| 30 | + * production "server restart" reconnect cascade. |
| 31 | + * 3. Slow-DB resilience: even with a 50 ms artificial DB delay |
| 32 | + * (worst-case for the race window), the bug remains fixed. |
| 33 | + */ |
| 34 | + |
| 35 | +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; |
| 36 | +import { createServer, type Server as HttpServer } from 'node:http'; |
| 37 | +import { WebSocket, WebSocketServer } from 'ws'; |
| 38 | +import { AddressInfo } from 'node:net'; |
| 39 | +import { WsBridge } from '../src/ws/bridge.js'; |
| 40 | +import { P2P_WORKFLOW_MSG } from '../../shared/p2p-workflow-messages.js'; |
| 41 | +import { P2P_WORKFLOW_CAPABILITY_V1 } from '../../shared/p2p-workflow-constants.js'; |
| 42 | + |
| 43 | +// ── Mocks ───────────────────────────────────────────────────────────────────── |
| 44 | +// Match the existing bridge.test.ts crypto stub so the auth path validates. |
| 45 | +import { vi } from 'vitest'; |
| 46 | + |
| 47 | +vi.mock('../src/security/crypto.js', () => ({ |
| 48 | + sha256Hex: (_s: string) => 'valid-hash', |
| 49 | +})); |
| 50 | + |
| 51 | +vi.mock('../src/routes/push.js', () => ({ |
| 52 | + dispatchPush: vi.fn(), |
| 53 | +})); |
| 54 | + |
| 55 | +// ── Helpers ─────────────────────────────────────────────────────────────────── |
| 56 | + |
| 57 | +interface DeferredDb { |
| 58 | + /** Override the DB query latency for the next handshake (ms). 0 = synchronous. */ |
| 59 | + setLatency(ms: number): void; |
| 60 | + db: import('../src/db/client.js').Database; |
| 61 | +} |
| 62 | + |
| 63 | +function makeDeferredDb(tokenHash: string): DeferredDb { |
| 64 | + let latency = 0; |
| 65 | + const queryOne = async <T = unknown>(): Promise<T | null> => { |
| 66 | + if (latency > 0) await new Promise((r) => setTimeout(r, latency)); |
| 67 | + return { token_hash: tokenHash } as T; |
| 68 | + }; |
| 69 | + const db = { |
| 70 | + queryOne, |
| 71 | + query: async () => [], |
| 72 | + execute: async () => ({ changes: 1 }), |
| 73 | + exec: async () => {}, |
| 74 | + transaction: async <T>(fn: (tx: import('../src/db/client.js').Database) => Promise<T>) => |
| 75 | + fn(db as unknown as import('../src/db/client.js').Database), |
| 76 | + close: () => {}, |
| 77 | + }; |
| 78 | + return { |
| 79 | + setLatency: (ms: number) => { latency = ms; }, |
| 80 | + db: db as unknown as import('../src/db/client.js').Database, |
| 81 | + }; |
| 82 | +} |
| 83 | + |
| 84 | +interface ConnectionOutcome { |
| 85 | + /** Whether the WS closed at any point during the test window. */ |
| 86 | + closed: boolean; |
| 87 | + /** Close code, if any. */ |
| 88 | + closeCode?: number; |
| 89 | + /** Close reason, if any. */ |
| 90 | + closeReason?: string; |
| 91 | + /** Messages received from server (parsed JSON). */ |
| 92 | + received: Array<Record<string, unknown>>; |
| 93 | + /** Snapshot of `bridge.isAuthenticated` taken AFTER the observe window |
| 94 | + * but BEFORE the test closes the socket. We must capture it here |
| 95 | + * because the bridge's ws.on('close') handler resets `authenticated` |
| 96 | + * to false — checking after the local close would always observe |
| 97 | + * false even on a successful auth. */ |
| 98 | + authenticatedDuringWindow: boolean; |
| 99 | +} |
| 100 | + |
| 101 | +/** |
| 102 | + * Drive the production daemon handshake (`auth` followed immediately by |
| 103 | + * `daemon.hello`) over a real `ws` client and report the outcome after |
| 104 | + * `observeMs`. |
| 105 | + */ |
| 106 | +async function driveDaemonHandshake( |
| 107 | + url: string, |
| 108 | + serverId: string, |
| 109 | + token: string, |
| 110 | + observeMs: number, |
| 111 | + observeAuth?: () => boolean, |
| 112 | +): Promise<ConnectionOutcome> { |
| 113 | + const ws = new WebSocket(url); |
| 114 | + const outcome: ConnectionOutcome = { closed: false, received: [], authenticatedDuringWindow: false }; |
| 115 | + |
| 116 | + await new Promise<void>((resolve, reject) => { |
| 117 | + const timer = setTimeout(() => reject(new Error('open timeout')), 2_000); |
| 118 | + ws.once('open', () => { clearTimeout(timer); resolve(); }); |
| 119 | + ws.once('error', (err) => { clearTimeout(timer); reject(err); }); |
| 120 | + }); |
| 121 | + |
| 122 | + ws.on('message', (raw) => { |
| 123 | + try { |
| 124 | + outcome.received.push(JSON.parse(raw.toString()) as Record<string, unknown>); |
| 125 | + } catch { /* ignore */ } |
| 126 | + }); |
| 127 | + ws.on('close', (code, reason) => { |
| 128 | + outcome.closed = true; |
| 129 | + outcome.closeCode = code; |
| 130 | + outcome.closeReason = reason.toString(); |
| 131 | + }); |
| 132 | + |
| 133 | + // Production daemon order: auth IMMEDIATELY followed by daemon.hello. |
| 134 | + // Both messages hit the server's async message handler before the |
| 135 | + // auth's DB query resolves — this is the race window. |
| 136 | + ws.send(JSON.stringify({ type: 'auth', serverId, token, daemonVersion: 'test-version' })); |
| 137 | + ws.send(JSON.stringify({ |
| 138 | + type: P2P_WORKFLOW_MSG.DAEMON_HELLO, |
| 139 | + daemonId: serverId, |
| 140 | + capabilities: [P2P_WORKFLOW_CAPABILITY_V1], |
| 141 | + helloEpoch: 1, |
| 142 | + sentAt: Date.now(), |
| 143 | + })); |
| 144 | + |
| 145 | + // Poll for authenticated state OR until observeMs elapses, whichever |
| 146 | + // comes first. Polling is more robust than a single sleep+check |
| 147 | + // because the WS round-trip + DB latency can vary by tens of ms in |
| 148 | + // CI. Captures auth state BEFORE we close the socket — the bridge's |
| 149 | + // ws.on('close') handler resets `authenticated` to false, so |
| 150 | + // checking after the local close would always observe false. |
| 151 | + const deadline = Date.now() + observeMs; |
| 152 | + if (observeAuth) { |
| 153 | + while (Date.now() < deadline) { |
| 154 | + if (observeAuth()) { |
| 155 | + outcome.authenticatedDuringWindow = true; |
| 156 | + break; |
| 157 | + } |
| 158 | + // Don't busy-loop — yield once per 10 ms so the bridge's async |
| 159 | + // message handler can run. |
| 160 | + await new Promise((r) => setTimeout(r, 10)); |
| 161 | + } |
| 162 | + // If the loop fell through without observing auth, leave the flag |
| 163 | + // false so the assertion fails with diagnostic context. |
| 164 | + } else { |
| 165 | + await new Promise((r) => setTimeout(r, observeMs)); |
| 166 | + } |
| 167 | + if (ws.readyState === WebSocket.OPEN) ws.close(1000, 'test_done'); |
| 168 | + return outcome; |
| 169 | +} |
| 170 | + |
| 171 | +// ── Test fixture ────────────────────────────────────────────────────────────── |
| 172 | + |
| 173 | +describe('WsBridge daemon auth-handshake race — e2e (real ws server)', () => { |
| 174 | + let httpServer: HttpServer; |
| 175 | + let wss: WebSocketServer; |
| 176 | + let port: number; |
| 177 | + let deferredDb: DeferredDb; |
| 178 | + const TOKEN = 'my-token'; |
| 179 | + // Each test/cycle gets its OWN server ID. `WsBridge.maybeCleanup` |
| 180 | + // deletes from the shared instances map by `serverId`, not by |
| 181 | + // instance pointer; if a prior test's connection close fires its |
| 182 | + // cleanup AFTER a new bridge has registered for the same serverId, |
| 183 | + // the new bridge gets evicted from the map and the daemon |
| 184 | + // connection becomes unreachable. In production every serverId |
| 185 | + // hosts a single bridge so the path is harmless, but back-to-back |
| 186 | + // tests rapid-cycle the same id and trip it. Generating fresh ids |
| 187 | + // sidesteps the cross-test eviction. |
| 188 | + const newServerId = (): string => |
| 189 | + `e2e-${Math.random().toString(36).slice(2, 10)}-${Date.now().toString(36)}`; |
| 190 | + |
| 191 | + beforeAll(async () => { |
| 192 | + deferredDb = makeDeferredDb('valid-hash'); |
| 193 | + httpServer = createServer(); |
| 194 | + wss = new WebSocketServer({ noServer: true }); |
| 195 | + |
| 196 | + httpServer.on('upgrade', (req, socket, head) => { |
| 197 | + // Extract the serverId from the URL path so each test's |
| 198 | + // connection lands on the right bridge instance even when tests |
| 199 | + // run back-to-back with overlapping close handlers. |
| 200 | + const url = req.url ?? ''; |
| 201 | + const match = url.match(/\/api\/server\/([^/]+)\/ws/); |
| 202 | + const serverId = match?.[1]; |
| 203 | + if (!serverId) { socket.destroy(); return; } |
| 204 | + wss.handleUpgrade(req, socket, head, (ws) => { |
| 205 | + WsBridge.get(serverId).handleDaemonConnection( |
| 206 | + ws as never, |
| 207 | + deferredDb.db, |
| 208 | + {} as never, |
| 209 | + ); |
| 210 | + }); |
| 211 | + }); |
| 212 | + |
| 213 | + await new Promise<void>((resolve) => { |
| 214 | + httpServer.listen(0, '127.0.0.1', () => resolve()); |
| 215 | + }); |
| 216 | + port = (httpServer.address() as AddressInfo).port; |
| 217 | + }); |
| 218 | + |
| 219 | + afterAll(async () => { |
| 220 | + WsBridge.getAll().clear(); |
| 221 | + await new Promise<void>((resolve, reject) => { |
| 222 | + wss.close((err) => (err ? reject(err) : resolve())); |
| 223 | + }); |
| 224 | + await new Promise<void>((resolve, reject) => { |
| 225 | + httpServer.close((err) => (err ? reject(err) : resolve())); |
| 226 | + }); |
| 227 | + }); |
| 228 | + |
| 229 | + /** Generous polling timeout. CI hosts add tens of ms of jitter on top |
| 230 | + * of the deferred-DB latency; 1 s is comfortably above any realistic |
| 231 | + * successful auth round-trip while still giving a fast failure when |
| 232 | + * the bug has actually re-introduced 4001-close behaviour. */ |
| 233 | + const OBSERVE_MS = 1_000; |
| 234 | + |
| 235 | + it('single back-to-back auth + daemon.hello stays open and authenticates', async () => { |
| 236 | + deferredDb.setLatency(0); |
| 237 | + const serverId = newServerId(); |
| 238 | + const url = `ws://127.0.0.1:${port}/api/server/${serverId}/ws`; |
| 239 | + const outcome = await driveDaemonHandshake(url, serverId, TOKEN, OBSERVE_MS, () => WsBridge.get(serverId).isAuthenticated); |
| 240 | + |
| 241 | + // Pre-fix: outcome.closeCode === 4001 ('auth_required') because |
| 242 | + // daemon.hello raced the auth's DB lookup. Post-fix: connection |
| 243 | + // survives and authenticates cleanly. |
| 244 | + expect(outcome.closeCode).not.toBe(4001); |
| 245 | + expect(outcome.authenticatedDuringWindow).toBe(true); |
| 246 | + }); |
| 247 | + |
| 248 | + it('survives a 50ms-DB-latency window without 4001-close', async () => { |
| 249 | + // 50 ms of DB latency is the worst-case race window: definitely long |
| 250 | + // enough that BOTH messages are queued in the message handler before |
| 251 | + // auth's DB lookup resolves. Without the `authPromise` serialization |
| 252 | + // this fails 100% of the time (4001 close). |
| 253 | + deferredDb.setLatency(50); |
| 254 | + const serverId = newServerId(); |
| 255 | + const url = `ws://127.0.0.1:${port}/api/server/${serverId}/ws`; |
| 256 | + const outcome = await driveDaemonHandshake(url, serverId, TOKEN, OBSERVE_MS, () => WsBridge.get(serverId).isAuthenticated); |
| 257 | + |
| 258 | + expect(outcome.closeCode).not.toBe(4001); |
| 259 | + expect(outcome.authenticatedDuringWindow).toBe(true); |
| 260 | + }); |
| 261 | + |
| 262 | + it('burst of 10 back-to-back reconnect cycles all authenticate cleanly (server-restart simulation)', { timeout: 30_000 }, async () => { |
| 263 | + // Simulates the production reconnect cascade after a server restart. |
| 264 | + // Each cycle: open → auth + daemon.hello → close. The race must be |
| 265 | + // closed for every single cycle, not just statistically most. |
| 266 | + deferredDb.setLatency(20); |
| 267 | + const cycles: ConnectionOutcome[] = []; |
| 268 | + for (let i = 0; i < 10; i += 1) { |
| 269 | + // Per-cycle unique serverId so stale-bridge close handlers from |
| 270 | + // the previous cycle can't evict the current cycle's bridge from |
| 271 | + // the shared map (see comment on `newServerId`). |
| 272 | + const serverId = newServerId(); |
| 273 | + const url = `ws://127.0.0.1:${port}/api/server/${serverId}/ws`; |
| 274 | + const outcome = await driveDaemonHandshake( |
| 275 | + url, |
| 276 | + serverId, |
| 277 | + TOKEN, |
| 278 | + OBSERVE_MS, |
| 279 | + () => WsBridge.get(serverId).isAuthenticated, |
| 280 | + ); |
| 281 | + cycles.push(outcome); |
| 282 | + } |
| 283 | + |
| 284 | + // Every cycle MUST avoid 4001. Counting failures gives a clearer |
| 285 | + // diagnostic than a single .toBe assertion when a flake creeps in. |
| 286 | + const flapped = cycles.filter((c) => c.closeCode === 4001); |
| 287 | + expect(flapped, `expected 0 cycles to 4001-close, got ${flapped.length} of 10`).toHaveLength(0); |
| 288 | + const failedAuth = cycles.filter((c) => !c.authenticatedDuringWindow); |
| 289 | + const diagnostic = JSON.stringify(cycles.map((c) => ({ |
| 290 | + closed: c.closed, closeCode: c.closeCode, closeReason: c.closeReason, |
| 291 | + auth: c.authenticatedDuringWindow, received: c.received.length, |
| 292 | + })), null, 2); |
| 293 | + expect(failedAuth, `expected 10 cycles to authenticate, got ${10 - failedAuth.length} of 10. cycles=${diagnostic}`).toHaveLength(0); |
| 294 | + }); |
| 295 | +}); |
0 commit comments