From 0ff03f7bd2dcb9bd84df754ed343cf0d7a8e1779 Mon Sep 17 00:00:00 2001
From: Roman PASSLER <roman.passler@gmail.com>
Date: Tue, 12 May 2026 20:40:24 +0200
Subject: [PATCH] ci: tighten job timeouts + add step-level limits for
 timed_out visibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cuts massively-over-budget job-timeouts down to honest worst-case +
margin, and pairs each long-running step with its own step-timeout so a
hang surfaces as `timed_out` on that step instead of as an opaque
job-level `cancelled` after the full budget elapses.

- ci.yml `test`: 30 → 25 min job; new 18 min step-timeout on Swift tests
  (p50 5–7 min, ~3× margin)
- e2e.yml `e2e`: 60 → 30 min job; new 25 min step-timeout on Run E2E
  tests (cold cache covers multi-GB model downloads)
- e2e-app.yml `e2e-app`: 15 → 20 min job (the three live-recording
  lanes now total ~14 min observed); new 10 min step-timeout on each
  lane so we can tell which one hung

Background: docs/plans/.local/open/2026-05-11-job-timeout-visibility.md
documents the cancelled-vs-timed_out distinction. Job-level timeouts map
to `cancelled` (same as user-cancel and concurrency-cancel) — only
step-level timeouts produce `timed_out`. Without step-timeouts, a hung
test takes ~30 min to fail AND looks identical to a manual cancel in
the UI.
---
 .github/workflows/ci.yml      | 13 +++++++++++--
 .github/workflows/e2e-app.yml | 10 +++++++++-
 .github/workflows/e2e.yml     | 11 +++++++----
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 97f783fa..9cea9a4a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -94,7 +94,12 @@ jobs:
     needs: changes
     if: needs.changes.outputs.code == 'true'
     runs-on: macos-26
-    timeout-minutes: 30
+    # 25 min job-timeout = 20 min setup (cold cache) + ~5 min margin. The
+    # `Swift tests` step below has its own 18 min step-timeout so a hang
+    # surfaces as `timed_out` (clearly distinguishable from cancellation),
+    # not the opaque `cancelled` a bare job-timeout produces — provided setup
+    # took under ~7 min (25 − 18). See docs/plans/.local/open/2026-05-11-job-timeout-visibility.md.
+    timeout-minutes: 25
     permissions:
       contents: read
       actions: write
@@ -113,14 +118,18 @@ jobs:
     steps:
       - uses: actions/checkout@v6
       # 20 min covers SPM + ML cache restore plus the cold-cache pre-warm
-      # download; well below the 30 min job timeout so a hung download
+      # download; below the 25 min job timeout so a hung download
       # surfaces as a step failure, not a job-timeout.
       - uses: ./.github/actions/setup-swift-test
         timeout-minutes: 20
         with:
           cache-key-suffix: ${{ matrix.variant }}
           swift-flags: ${{ matrix.swift-flags }}
+      # 18 min step-timeout on the actual tests: typical hot-cache run is
+      # 5–7 min, so this is ~3× margin while still failing fast as
+      # `timed_out` (not `cancelled`) when a single test hangs.
       - name: Swift tests
+        timeout-minutes: 18
         run: cd app/MeetingTranscriber && swift test --parallel --skip ModelPreloadTests ${{ matrix.swift-flags }}
       - name: Cancel run on failure
         if: failure()
diff --git a/.github/workflows/e2e-app.yml b/.github/workflows/e2e-app.yml
index 7bd4b7e2..0b12925b 100644
--- a/.github/workflows/e2e-app.yml
+++ b/.github/workflows/e2e-app.yml
@@ -33,7 +33,12 @@ jobs:
     # racing for the audio stack — see runner labels via
     # `gh api /repos/{owner}/{repo}/actions/runners`.
     runs-on: [self-hosted, macOS, ARM64, audio]
-    timeout-minutes: 15
+    # Three lanes (transcript, record-only, reimport) at ~4–6 min each
+    # → ~14 min observed total. 20 min job budget gives honest margin;
+    # each lane has its own 10 min step-timeout below so a hang in lane N
+    # surfaces as `timed_out` on that step, not an opaque job-level `cancelled` —
+    # unless the 20 min job budget (< 3 × 10 min) runs out first. See docs/plans/.local/open/2026-05-11-job-timeout-visibility.md.
+    timeout-minutes: 20
 
     steps:
       - uses: actions/checkout@v6
@@ -69,6 +74,7 @@ jobs:
       - name: Run live-recording E2E driver
         env:
           DEVELOPER_ID: ${{ secrets.DEVELOPER_ID }}
+        timeout-minutes: 10
         # `--two-meetings` always: meeting 1 covers fresh-state single-run,
         # meeting 2 covers cooldown + state reset. ~3 min total wall on Mini
         # vs ~1 min for single-meeting; signal latency is fine on nightly +
@@ -86,6 +92,7 @@ jobs:
       - name: Run live-recording E2E driver (record-only)
         env:
           DEVELOPER_ID: ${{ secrets.DEVELOPER_ID }}
+        timeout-minutes: 10
         run: bash scripts/e2e-app.sh --no-build --record-only
 
       # Re-import lane: chains a record-only meeting with a POST
@@ -101,6 +108,7 @@ jobs:
       - name: Run live-recording E2E driver (re-import recorded WAV)
         env:
           DEVELOPER_ID: ${{ secrets.DEVELOPER_ID }}
+        timeout-minutes: 10
         run: bash scripts/e2e-app.sh --no-build --reimport-recorded
 
       # Best-effort: capture the simulator's stdout for post-mortem if
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 73b56ed7..8cacc1a0 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -38,10 +38,12 @@ concurrency:
 jobs:
   e2e:
     runs-on: [self-hosted, macOS, ARM64]
-    # 60 min headroom: cold runs download three multi-GB models
-    # (WhisperKit ~1 GB, Parakeet ~50 MB, Qwen3 ~1.75 GB). Subsequent
-    # runs hit the SPM cache + on-disk model cache and finish in <10 min.
-    timeout-minutes: 60
+    # Cold-cache runs download three models totalling ~2.8 GB (WhisperKit
+    # ~1 GB, Parakeet ~50 MB, Qwen3 ~1.75 GB); hot runs <10 min. 30 min job
+    # budget covers worst-case cold download. The `Run E2E tests` step
+    # below has a 25 min step-timeout so a hang surfaces as `timed_out`
+    # — see docs/plans/.local/open/2026-05-11-job-timeout-visibility.md.
+    timeout-minutes: 30
     env:
       E2E_ENABLED: "1"
 
@@ -69,6 +71,7 @@ jobs:
       # via HTTPS on hit and waste the multi-GB upload on miss — the
       # latter timed out the post-action on a real run (5.6 GB → GitHub).
       - name: Run E2E tests
+        timeout-minutes: 25
         run: |
           cd app/MeetingTranscriber
           swift test --filter '${{ steps.filter.outputs.filter }}'