From a31df823890f004b1220d4d86158cff0f2b76d18 Mon Sep 17 00:00:00 2001 From: Terence Cho Date: Sat, 16 May 2026 12:09:41 -0700 Subject: [PATCH 1/2] fix(engine): treat ffmpegStreamingTimeout as per-frame inactivity, not total render time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The streaming encoder's safety timer was set once at spawn and fired SIGTERM unconditionally at `ffmpegStreamingTimeout` ms — turning a "FFmpeg is hung" guard into a hard cap on total render duration. Slow-but-progressing captures (CI runner under load, large compositions, slower compositor paths after #838's always-clip change) regularly exceeded the 600s default and were killed mid-encode. The symptom surfaced as: Streaming encode failed: FFmpeg exited with code 255 video:NNNkB audio:0kB ... Exiting normally, received signal 15. libx264 had encoded most frames cleanly; SIGTERM arrived during the encode, libx264 printed its end-of-encode stats, and Node observed a non-zero exit. The `audio:0kB` in stderr is incidental — the streaming encoder is video-only; audio is muxed later in `assembleStage`. Convert the timer to a heartbeat: every successful `writeFrame` resets it. Reset on `accepted === false` as well — backpressure means FFmpeg is consuming slower than we're producing, but data is still flowing into stdin. Only true hangs (Chrome dead, capture loop stuck, no frames at all for the timeout window) now trip SIGTERM. Total wall-clock render time is unbounded, which matches operator intent: a renderer that's making progress should be allowed to finish. Renames are intentionally avoided — `ffmpegStreamingTimeout` keeps its name and `600_000` default; only the semantics changed. The config doc spells out the new behavior so downstream consumers know what 600s now means. Added unit test: - 9× writeFrame at 900ms intervals (under the 1000ms threshold) — encoder stays alive through 8.1s of "slow but progressing" capture - Stall past the threshold — SIGTERM fires --- packages/engine/src/config.ts | 7 ++- .../src/services/streamingEncoder.test.ts | 33 ++++++++++++++ .../engine/src/services/streamingEncoder.ts | 43 ++++++++++++++----- 3 files changed, 71 insertions(+), 12 deletions(-) diff --git a/packages/engine/src/config.ts b/packages/engine/src/config.ts index e855af8d7..ab0871ad2 100644 --- a/packages/engine/src/config.ts +++ b/packages/engine/src/config.ts @@ -94,7 +94,12 @@ export interface EngineConfig { ffmpegEncodeTimeout: number; /** Timeout for FFmpeg mux/faststart processes (ms). Default: 300_000 */ ffmpegProcessTimeout: number; - /** Timeout for FFmpeg streaming encode (ms). Default: 600_000 */ + /** + * Inactivity timeout for FFmpeg streaming encode (ms). The timer resets on + * every successful `writeFrame` call, so this caps the duration of a + * single "no frame arrived" gap (capture hang, dead Chrome), not the total + * render time. Default: 600_000 (10 minutes without any frame = dead). + */ ffmpegStreamingTimeout: number; // ── HDR ────────────────────────────────────────────────────────────── diff --git a/packages/engine/src/services/streamingEncoder.test.ts b/packages/engine/src/services/streamingEncoder.test.ts index cfcb6774c..c7170e643 100644 --- a/packages/engine/src/services/streamingEncoder.test.ts +++ b/packages/engine/src/services/streamingEncoder.test.ts @@ -572,4 +572,37 @@ describe("spawnStreamingEncoder lifecycle and cleanup", () => { controller.abort(); expect(proc.kill).not.toHaveBeenCalled(); }); + + it("inactivity timeout fires only after a no-frame gap exceeds ffmpegStreamingTimeout", async () => { + vi.useFakeTimers(); + try { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { spawnStreamingEncoder } = await import("./streamingEncoder.js"); + const dir = mkdtempSync(join(tmpdir(), "se-heartbeat-")); + const encoder = await spawnStreamingEncoder(join(dir, "out.mp4"), baseOptions, undefined, { + ffmpegStreamingTimeout: 1000, + }); + + const proc = calls[0]!.proc; + + // Frames every 900ms — under the 1000ms inactivity threshold — should + // keep resetting the timer. After 9× 900ms = 8.1s of "slow but + // progressing" capture the encoder must still be alive. The old total- + // render timeout would have fired SIGTERM at ~1000ms. + for (let i = 0; i < 9; i++) { + encoder.writeFrame(Buffer.from([i])); + vi.advanceTimersByTime(900); + } + expect(proc.kill).not.toHaveBeenCalled(); + + // Now stall — no writeFrame for longer than the threshold. SIGTERM fires. + vi.advanceTimersByTime(1100); + expect(proc.kill).toHaveBeenCalledWith("SIGTERM"); + } finally { + vi.useRealTimers(); + } + }); }); diff --git a/packages/engine/src/services/streamingEncoder.ts b/packages/engine/src/services/streamingEncoder.ts index a31edeb01..279890698 100644 --- a/packages/engine/src/services/streamingEncoder.ts +++ b/packages/engine/src/services/streamingEncoder.ts @@ -416,13 +416,26 @@ export async function spawnStreamingEncoder( } } - // Timeout safety + // Inactivity timeout: fires only when no frame has been written for + // `ffmpegStreamingTimeout` ms. A slow-but-progressing capture (e.g. a CI + // runner under load) keeps resetting the timer on each writeFrame, so total + // wall-clock render time is unbounded — only a true hang (Chrome dead, + // capture stuck, no frames arriving) trips SIGTERM. The 600s default was + // previously a total-render cap, which intermittently killed legitimate + // slow renders mid-encode (FFmpeg got SIGTERM after most frames were sent; + // libx264 printed its summary and exited 255, observable as + // "Streaming encode failed: FFmpeg exited with code 255" with audio:0kB). const streamingTimeout = config?.ffmpegStreamingTimeout ?? DEFAULT_CONFIG.ffmpegStreamingTimeout; - const timer = setTimeout(() => { - if (exitStatus === "running") { - ffmpeg.kill("SIGTERM"); - } - }, streamingTimeout); + let timer: NodeJS.Timeout | null = null; + const resetTimer = () => { + if (timer) clearTimeout(timer); + timer = setTimeout(() => { + if (exitStatus === "running") { + ffmpeg.kill("SIGTERM"); + } + }, streamingTimeout); + }; + resetTimer(); const encoder: StreamingEncoder = { writeFrame: (buffer: Buffer): boolean => { @@ -433,11 +446,16 @@ export async function spawnStreamingEncoder( // provided buffer and drain it asynchronously. The HDR path's compositor // reuses pre-allocated transOutput/normalCanvas buffers across frames, // so without this copy the pipe would read partially-overwritten data - // and flicker. The SDR path doesn't invoke writeFrame at all (it pipes - // PNG files via encodeFramesFromDir), so the memcpy here is HDR-only - // and justified by correctness. + // and flicker. const copy = Buffer.from(buffer); - return ffmpeg.stdin.write(copy); + const accepted = ffmpeg.stdin.write(copy); + // Reset inactivity timer: each successful frame write is a sign of + // forward progress. We reset even on `accepted === false` (backpressure + // means FFmpeg is consuming slower than we're producing, but data is + // still flowing into its stdin buffer) so that a sustained slow-consumer + // scenario doesn't get killed mid-encode. + resetTimer(); + return accepted; }, close: async (): Promise => { @@ -455,7 +473,10 @@ export async function spawnStreamingEncoder( // repeated calls. If you change this method, preserve idempotency or // a regression here will silently double-close ffmpeg and produce // harder-to-trace errors at the orchestrator layer. - clearTimeout(timer); + if (timer) { + clearTimeout(timer); + timer = null; + } if (signal) signal.removeEventListener("abort", onAbort); const stdin = ffmpeg.stdin; From 111be1e33a0247b1fed7aae712ec9e87d435f764 Mon Sep 17 00:00:00 2001 From: Terence Cho Date: Sat, 16 May 2026 12:17:42 -0700 Subject: [PATCH 2/2] fix(engine): only reset streaming heartbeat on accepted writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review on #901: resetting the heartbeat on every writeFrame call — including ones that returned false — re-introduces a worse failure mode than the bug it fixed. If FFmpeg hangs while Chrome keeps capturing, every writeFrame returns false (Node buffers because the kernel pipe isn't being drained), the heartbeat resets anyway, and Node's writable stream grows without bound until OOM. The capture loop doesn't `await drain`, so there's no backpressure path to throttle Chrome. Tighten the reset to `accepted === true` only. A `true` return means the write went straight to the kernel pipe — proof FFmpeg actually consumed something. In steady state with a slower-but-alive FFmpeg, writes alternate between true and false as the buffer drains and refills; the `true`s are enough to keep the heartbeat ticking. A truly hung FFmpeg yields only `false`s, so the timer eventually fires and SIGTERMs the process. Test: simulate stalled FFmpeg by overriding stdin.write to return false on every call, pump 9 frames at 900ms intervals (would have kept the prior implementation alive indefinitely), and assert SIGTERM fires inside the 1000ms timeout window. The earlier "slow-but-progressing" test (writes return true) continues to pass. --- .../src/services/streamingEncoder.test.ts | 34 +++++++++++++++++++ .../engine/src/services/streamingEncoder.ts | 16 +++++---- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/packages/engine/src/services/streamingEncoder.test.ts b/packages/engine/src/services/streamingEncoder.test.ts index c7170e643..70fd95ea5 100644 --- a/packages/engine/src/services/streamingEncoder.test.ts +++ b/packages/engine/src/services/streamingEncoder.test.ts @@ -605,4 +605,38 @@ describe("spawnStreamingEncoder lifecycle and cleanup", () => { vi.useRealTimers(); } }); + + it("inactivity timeout still fires when stdin is backpressured (stalled ffmpeg, live producer)", async () => { + vi.useFakeTimers(); + try { + // Simulate the FFmpeg-hangs-but-Chrome-keeps-producing case: stdin.write + // always returns false (Node has to buffer because ffmpeg isn't draining + // the pipe). The heartbeat must NOT reset on those buffered writes — + // otherwise a hung ffmpeg with a steady frame producer would never + // SIGTERM and we'd grow Node's stdin buffer until OOM. + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { spawnStreamingEncoder } = await import("./streamingEncoder.js"); + const dir = mkdtempSync(join(tmpdir(), "se-backpressure-")); + const encoder = await spawnStreamingEncoder(join(dir, "out.mp4"), baseOptions, undefined, { + ffmpegStreamingTimeout: 1000, + }); + + const proc = calls[0]!.proc; + proc.stdin.write = (_chunk: Buffer) => false; + + // Pump 9 frames at 900ms intervals — all returning false. The reset + // should NOT fire (every write was buffered, not accepted), so the + // 1000ms timer (last reset on spawn) elapses near the start. + for (let i = 0; i < 9; i++) { + encoder.writeFrame(Buffer.from([i])); + vi.advanceTimersByTime(900); + } + expect(proc.kill).toHaveBeenCalledWith("SIGTERM"); + } finally { + vi.useRealTimers(); + } + }); }); diff --git a/packages/engine/src/services/streamingEncoder.ts b/packages/engine/src/services/streamingEncoder.ts index 279890698..9f2bcf81e 100644 --- a/packages/engine/src/services/streamingEncoder.ts +++ b/packages/engine/src/services/streamingEncoder.ts @@ -449,12 +449,16 @@ export async function spawnStreamingEncoder( // and flicker. const copy = Buffer.from(buffer); const accepted = ffmpeg.stdin.write(copy); - // Reset inactivity timer: each successful frame write is a sign of - // forward progress. We reset even on `accepted === false` (backpressure - // means FFmpeg is consuming slower than we're producing, but data is - // still flowing into its stdin buffer) so that a sustained slow-consumer - // scenario doesn't get killed mid-encode. - resetTimer(); + // Reset inactivity timer ONLY on `accepted === true`. `true` means the + // write went through to the kernel pipe without buffering in Node — + // proof FFmpeg is actually consuming. `false` means Node's writable + // stream had to buffer (FFmpeg hasn't drained the pipe yet); we deliberately + // don't reset on `false` so a hung FFmpeg with a still-producing Chrome + // can't keep us alive forever while Node's stdin buffer grows to OOM. In + // steady state with a slower-but-alive FFmpeg, writes alternate between + // true and false as the buffer drains and refills; the trues are enough + // to keep the heartbeat ticking. + if (accepted) resetTimer(); return accepted; },