From 18df606127e9ccc4fd338f18d31714d63a17255e Mon Sep 17 00:00:00 2001 From: James Date: Mon, 18 May 2026 21:25:46 +0000 Subject: [PATCH 1/2] chore(engine): structured logging for sub-composition timeline poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures host IDs, timeline IDs before/after the poll, the set of timeline IDs added during the poll, and whether the rebind condition was met (`rebindFired`; note that __hfForceTimelineRebind itself is only invoked when it is defined on window) — emitted as a single JSON log line per render. Lets us correlate flaky regression runs (style-7-prod, gsap-letters-render-compat, style-3-prod) with whether the count-based rebind heuristic fired. Behavior is unchanged: the rebind condition `addedDuringPoll.length > 0` is equivalent to the prior `timelinesAfterPoll > timelinesBeforePoll` under the invariant that timeline IDs are never removed mid-poll. Intended to be reverted once the race condition is confirmed and patched. --- packages/engine/src/services/frameCapture.ts | 40 ++++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/packages/engine/src/services/frameCapture.ts b/packages/engine/src/services/frameCapture.ts index 62b3729e7..8109489a0 100644 --- a/packages/engine/src/services/frameCapture.ts +++ b/packages/engine/src/services/frameCapture.ts @@ -365,20 +365,46 @@ async function pollSubCompositionTimelines( } return true; })()`; - const timelinesBeforePoll = Number( - await page.evaluate(`Object.keys(window.__timelines || {}).length`), - ); + // Observability snapshot: capture host IDs + timeline IDs before and after + // the poll so flaky CI runs can be correlated with whether the count-based + // rebind heuristic fired. Temporary — drop once the race condition behind + // the regression flakes is confirmed and patched. 
+ const beforeSnapshot = (await page.evaluate(`(function() { + var hosts = document.querySelectorAll("[data-composition-id]"); + var hostIds = []; + for (var i = 0; i < hosts.length; i++) { + var id = hosts[i].getAttribute("data-composition-id"); + if (id) hostIds.push(id); + } + return { hostIds: hostIds, timelineIds: Object.keys(window.__timelines || {}) }; + })()`)) as { hostIds: string[]; timelineIds: string[] }; + const pollStart = Date.now(); const ready = await pollPageExpression(page, expression, timeoutMs, intervalMs); - const timelinesAfterPoll = Number( - await page.evaluate(`Object.keys(window.__timelines || {}).length`), - ); - if (ready && timelinesAfterPoll > timelinesBeforePoll) { + const pollMs = Date.now() - pollStart; + const afterTimelineIds = (await page.evaluate( + `Object.keys(window.__timelines || {})`, + )) as string[]; + const beforeSet = new Set(beforeSnapshot.timelineIds); + const addedDuringPoll = afterTimelineIds.filter((id) => !beforeSet.has(id)); + const rebindFired = ready && addedDuringPoll.length > 0; + if (rebindFired) { await page.evaluate(`(function() { if (typeof window.__hfForceTimelineRebind === "function") { window.__hfForceTimelineRebind(); } })()`); } + console.log( + `[FrameCapture] pollSubCompositionTimelines ${JSON.stringify({ + hostIds: beforeSnapshot.hostIds, + timelineIdsBefore: beforeSnapshot.timelineIds, + timelineIdsAfter: afterTimelineIds, + addedDuringPoll, + pollMs, + ready, + rebindFired, + })}`, + ); if (!ready) { const missing = await page.evaluate(`(function() { var hosts = document.querySelectorAll("[data-composition-id]"); From ddc6c2e16ee88b3180beec01f59db4d24a1a7437 Mon Sep 17 00:00:00 2001 From: James Date: Mon, 18 May 2026 21:27:04 +0000 Subject: [PATCH 2/2] ci: disable fail-fast in regression matrix for this investigation Lets every shard run to completion and emit its pollSubCompositionTimelines JSON log line, so we can correlate rebindFired across all shards on a single run instead of only seeing 
data from whichever shard happened to fail first. Restore fail-fast: true before merging or reverting this PR. --- .github/workflows/regression.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 62e2bc503..b33f4c477 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -50,7 +50,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 60 strategy: - fail-fast: true + # Temporarily disabled for the regression-flake investigation on + # chore/regression-poll-observability — we need every shard to emit + # its pollSubCompositionTimelines log even when an early shard fails. + # Restore to `true` before merging this PR (or before reverting it). + fail-fast: false matrix: # Shards are bin-packed by measured per-test duration (LPT heuristic on # CI run 25893372795) so each row carries ~15-16 min of work. When a