gh-aw-test/.github/workflows/e2e.yml at main · githubnext/gh-aw-test · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
name: e2e

# Compile-compatibility matrix that runs the E2E suite against multiple gh-aw
# refs × samples modes. The scheduled (nightly) run exercises the full matrix:
#   1. github/gh-aw `main`              (source mode)
#   2. the latest gh-aw pre-release tag (extension mode)
#   3. the latest gh-aw stable release  (extension mode)
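# each designed to be crossed with samples=false (live engine) and samples=true
# (deterministic). NOTE: samples=false is temporarily disabled in the resolve
# step, so every entry currently runs with samples=true only.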
#
# Execution model
# ---------------
# * SERIAL: the matrix runs with `max-parallel: 1`. Each entry itself dispatches
#   dozens of test workflows, so running entries concurrently would flood GitHub
#   Actions. Entries are therefore queued one at a time (`fail-fast: false` so a
#   single entry's failure doesn't cancel the rest).
# * RUNS FROM MAIN: each entry recompiles its workflows, pushes them to `main`,
#   and dispatches every test from `main`. Event-triggered tests (add-comment,
#   add-labels, update-issue, etc.) create their own fixtures and wait for the
#   GitHub event to fire, exactly as they do when run locally.
#
# Manual dispatch
# ---------------
# `workflow_dispatch` exposes a `selection` choice so a human can run a single
# combination quickly instead of the whole matrix. The default is `quick`
# (main / source / samples=true). `full` runs the entire matrix serially, same
# as the nightly schedule.
#
# Because `$CI` is set to `true` by GitHub Actions, `e2e.sh` runs in CI mode: it
# does NOT mutate the repository's `TEMP_USER_PAT` secret (it relies on the
# pre-configured repo secret + the `GH_AW_TEST_PAT` env var instead).
#
# Required secrets:
#   * GH_AW_TEST_PAT      (required) — PAT used for all operations except copilot requests

on:
  # Nightly at 03:00 UTC. Scheduled runs have no `selection` input, so the
  # resolve step falls back to the full matrix.
  schedule:
    - cron: '0 3 * * *'
  # Manual run. `selection` narrows the matrix to a single combination.
  workflow_dispatch:
    inputs:
      selection:
        type: choice
        description: Which part of the matrix to run
        required: false
        default: quick
        options:
          # main / source / samples=true   (fast, deterministic)
          - quick
          # main / source / samples=true   (live engine path — samples=false
          # temporarily disabled)
          - main-live
          # latest release / extension / samples=true
          - release
          # latest pre-release / extension / samples=true
          - prerelease
          # full matrix, run serially
          - full
      gh_aw_ref:
        description: >-
          github/gh-aw branch, tag, or SHA to test in source mode (e.g. main,
          a feature branch, or a commit SHA). Applies to the quick / main-live
          selections; defaults to main.
        required: false
        default: ''
      refs:
        description: >-
          Optional comma-separated list of gh-aw refs to test (overrides main /
          latest pre-release / latest release auto-detection). Only used when
          selection=full.
        required: false
        default: ''

# All runs of this workflow share one concurrency group (keyed on the workflow
# name). With `cancel-in-progress: true`, a newly triggered run cancels the
# run that is already in progress instead of queueing behind it.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

# Default GITHUB_TOKEN scopes for jobs that do not declare their own
# `permissions`. Steps that need more (e.g. pushing recompiled workflows to
# main) are given the GH_AW_TEST_PAT secret explicitly.
permissions:
  contents: read
  issues: write
  pull-requests: read
  actions: read

jobs:
  # Computes the list of (label, ref, mode, samples) combinations that the
  # `e2e` job fans out over and publishes it as the `matrix` output, together
  # with the resolved gh-aw main SHA and the latest release / pre-release tags.
  resolve-refs:
    name: Resolve gh-aw refs to test
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.resolve.outputs.matrix }}
      main_sha: ${{ steps.resolve.outputs.main_sha }}
      latest_release: ${{ steps.resolve.outputs.latest_release }}
      latest_prerelease: ${{ steps.resolve.outputs.latest_prerelease }}
    steps:
      - name: Resolve refs
        id: resolve
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
          OVERRIDE: ${{ inputs.refs }}
          # Optional github/gh-aw branch/tag/SHA to test in source mode. Applies
          # to the single-entry quick / main-live selections; empty => main.
          GH_AW_REF_INPUT: ${{ inputs.gh_aw_ref }}
          # On `schedule` (and any non-dispatch event) there is no selection
          # input, so default to the full matrix. On workflow_dispatch this is
          # the user's chosen option (default `quick`).
          SELECTION: ${{ github.event_name == 'workflow_dispatch' && inputs.selection || 'full' }}
        run: |
          set -euo pipefail

          main_sha=$(gh api repos/github/gh-aw/commits/main --jq '.sha' | cut -c1-12)
          latest_release=$(gh release list --repo github/gh-aw --limit 50 \
              --json tagName,isPrerelease,isDraft \
              --jq '[.[] | select(.isPrerelease==false and .isDraft==false)][0].tagName' || echo "")
          latest_prerelease=$(gh release list --repo github/gh-aw --limit 50 \
              --json tagName,isPrerelease,isDraft \
              --jq '[.[] | select(.isPrerelease==true and .isDraft==false)][0].tagName' || echo "")

          if [[ -z "$latest_release" ]]; then
            echo "::error::Could not resolve latest gh-aw release"
            exit 1
          fi
          # No pre-release published: fall back to the stable release so the
          # pre-release slot still resolves to a real tag.
          if [[ -z "$latest_prerelease" ]]; then
            latest_prerelease="$latest_release"
          fi

          echo "Selection: $SELECTION"

          # The single-entry source-mode selections (quick / main-live) test a
          # specific github/gh-aw ref. Default to `main`, but allow the dispatch
          # input `gh_aw_ref` to point them at any branch, tag, or SHA
          # (`:-` covers both the unset and the empty-string case). The label is
          # slugified from the ref so job titles / artifact names stay valid
          # and readable.
          src_ref="${GH_AW_REF_INPUT:-main}"
          src_label=$(echo "$src_ref" | tr '/' '-' | tr -cd 'A-Za-z0-9._-')
          echo "Source-mode ref: $src_ref (label: $src_label)"

          # Each matrix entry is a single, fully-specified combination:
          #   label   : short human-readable name used in job titles / report
          #             and in the uploaded artifact name
          #   ref     : git ref / tag passed to the build or install step
          #   mode    : 'source'    => check out github/gh-aw at <ref>, `make build`,
          #                           then run e2e.sh --gh-aw-ref <ref>.
          #             'extension' => `gh extension install github/gh-aw --pin <ref>`,
          #                           then run e2e.sh with NO --gh-aw-ref.
          #   samples : true  => run with --use-samples (deterministic, no engine)
          #             false => run against the live AI engine
          #
          # The job that consumes this matrix runs serially (max-parallel: 1).
          # Every entry pushes its recompiled workflows to `main`, so that
          # serial execution is what keeps entries from racing each other.
          #
          # `full` builds the complete (ref × samples) product. The single-entry
          # selections pick exactly one combination for a fast manual dispatch.
          case "$SELECTION" in
            quick)
              matrix=$(jq -nc --arg ref "$src_ref" --arg label "$src_label" \
                '[{ label: $label, ref: $ref, mode: "source", samples: true }]')
              ;;
            main-live)
              # samples=false temporarily disabled — COPILOT_GITHUB_TOKEN needs rotation.
              # Change `samples: false` back to re-enable live-engine runs.
              matrix=$(jq -nc --arg ref "$src_ref" --arg label "$src_label" \
                '[{ label: $label, ref: $ref, mode: "source", samples: true }]')
              ;;
            release)
              matrix=$(jq -nc --arg rel "$latest_release" \
                '[{ label: "release", ref: $rel, mode: "extension", samples: true }]')
              ;;
            prerelease)
              matrix=$(jq -nc --arg pre "$latest_prerelease" \
                '[{ label: "pre-release", ref: $pre, mode: "extension", samples: true }]')
              ;;
            full|*)
              # Override format (workflow_dispatch input `refs`): comma-separated
              # tokens. Each token may be a bare ref (mode=source) or
              # "<ref>:source" / "<ref>:extension" to force a mode.
              #
              # The label is slugified with the same rule as src_label above
              # ("/" -> "-", then drop anything outside [A-Za-z0-9._-]) because it
              # becomes part of the artifact name, where "/" is not allowed.
              # A missing or empty mode means source; any other unknown mode
              # aborts here instead of producing an entry that neither the
              # source nor the extension setup steps would match.
              if [[ -n "$OVERRIDE" ]]; then
                base=$(echo "$OVERRIDE" | jq -Rc '
                  def slug: gsub("/"; "-") | gsub("[^A-Za-z0-9._-]"; "");
                  split(",")
                  | map(gsub("^\\s+|\\s+$"; ""))
                  | map(select(length > 0))
                  | map(
                      split(":") as $parts
                      | (if ($parts[1] // "") == "" then "source" else $parts[1] end) as $mode
                      | if $mode == "source" or $mode == "extension" then
                          { label: ($parts[0] | slug), ref: $parts[0], mode: $mode }
                        else
                          error("refs token \"\(.)\" has unknown mode \"\($mode)\" (expected source or extension)")
                        end
                    )')
              else
                base=$(jq -nc \
                  --arg pre "$latest_prerelease" \
                  --arg rel "$latest_release" \
                  '[
                    { label: "main",           ref: "main", mode: "source" },
                    { label: "pre-release",    ref: $pre,  mode: "extension" },
                    { label: "release",        ref: $rel,  mode: "extension" }
                  ]
                  # de-dupe by ref so pre-release==release does not run twice
                  | unique_by(.ref)')
              fi

              # # Expand each base entry across samples modes:
              # #   main        → samples=false AND samples=true (inference + deterministic)
              # #   pre-release/release → samples=true only (skip inference runs for speed)
              # matrix=$(echo "$base" | jq -c '[ .[] as $e | if $e.label == "main" then (false, true) else true end | $e + { samples: . } ]')

              # Expand each base entry across samples modes, always true for now
              #   main        → samples=true (skip inference runs for stability for now)
              #   pre-release/release → samples=true only (skip inference runs for speed)
              # To re-enable inference runs for main, restore the commented-out
              # expression above.
              matrix=$(echo "$base" | jq -c 'map(. + { samples: true })')
              ;;
          esac

          echo "Resolved:"
          echo "  main:               main ($main_sha)"
          echo "  latest pre-release: $latest_prerelease"
          echo "  latest release:     $latest_release"
          echo "  matrix:             $matrix"

          {
            echo "main_sha=$main_sha"
            echo "latest_release=$latest_release"
            echo "latest_prerelease=$latest_prerelease"
            echo "matrix=$matrix"
          } >> "$GITHUB_OUTPUT"

  # Runs the E2E suite once per resolved matrix entry, strictly one entry at a
  # time, and uploads each entry's logs plus its per-entry report files as an
  # artifact that the `report` job downloads.
  e2e:
    name: "E2E: ${{ matrix.entry.label }} (gh-aw@${{ matrix.entry.ref }}, ${{ matrix.entry.mode }}, samples=${{ matrix.entry.samples }})"
    needs: resolve-refs
    runs-on: ubuntu-latest
    timeout-minutes: 90
    strategy:
      # Run entries one at a time so the suite never floods GitHub Actions with
      # concurrent dispatched test runs (each entry itself dispatches many
      # workflows). fail-fast is off so one entry's failure doesn't cancel the
      # rest of the (serial) queue.
      fail-fast: false
      max-parallel: 1
      matrix:
        entry: ${{ fromJSON(needs.resolve-refs.outputs.matrix) }}
    steps:
      - name: Checkout gh-aw-test
        uses: actions/checkout@v4
        with:
          # Always check out the latest HEAD of main, not the trigger SHA
          # (`github.sha`). By the time this entry runs (serial matrix,
          # queued), a prior matrix entry may have pushed a recompile commit
          # to main. Without `ref: main`, the checkout would be behind
          # origin/main and `git push` would fail with non-fast-forward.
          ref: main
          path: gh-aw-test
          fetch-depth: 0
          # Use the PAT so e2e.sh can push the recompiled workflows to main
          # (the default GITHUB_TOKEN only has contents:read in this workflow).
          token: ${{ secrets.GH_AW_TEST_PAT }}

      # --- source mode: build gh-aw from a checkout ---------------------------
      - name: Checkout github/gh-aw at ${{ matrix.entry.ref }} (source mode)
        if: matrix.entry.mode == 'source'
        uses: actions/checkout@v4
        with:
          repository: github/gh-aw
          ref: ${{ matrix.entry.ref }}
          path: gh-aw
          fetch-depth: 1

      - name: Set up Go (source mode)
        if: matrix.entry.mode == 'source'
        uses: actions/setup-go@v5
        with:
          go-version-file: gh-aw/go.mod
          cache-dependency-path: gh-aw/go.sum

      - name: Build gh-aw binary from source (source mode)
        if: matrix.entry.mode == 'source'
        working-directory: gh-aw
        run: make build

      # --- extension mode: install published gh extension at the pinned tag ---
      - name: Install published gh-aw extension at ${{ matrix.entry.ref }} (extension mode)
        if: matrix.entry.mode == 'extension'
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
          # Passed through the environment instead of being interpolated into
          # the script, so a ref supplied via the `refs` dispatch input is
          # always treated as data and never parsed as shell syntax.
          E2E_REF: ${{ matrix.entry.ref }}
        run: |
          set -euo pipefail
          # Remove any pre-existing install so --pin actually wins
          gh extension remove github/gh-aw 2>/dev/null || true
          # Retry up to 3 times with backoff. Prior matrix entries (especially
          # the 45-minute source-mode run) can exhaust the API rate limit,
          # causing a transient 403 on the release-asset download.
          for _attempt in 1 2 3; do
            if gh extension install github/gh-aw --pin "$E2E_REF"; then
              break
            fi
            if [[ "$_attempt" -eq 3 ]]; then
              echo "::error::Failed to install gh-aw extension after 3 attempts"
              exit 1
            fi
            echo "Install attempt $_attempt failed; waiting 90s for rate limit to recover..."
            sleep 90
          done
          gh aw --version

      # --- shared steps -------------------------------------------------------
      - name: Set up Node.js
        uses: actions/setup-node@v5

      - name: gh auth status
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
        run: gh auth status

      - name: Run E2E against gh-aw@${{ matrix.entry.ref }} (${{ matrix.entry.mode }} mode)
        id: run
        working-directory: gh-aw-test
        timeout-minutes: 80
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
          GITHUB_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
          GH_AW_TEST_PAT: ${{ secrets.GH_AW_TEST_PAT }}
          E2E_REF: ${{ matrix.entry.ref }}
          E2E_MODE: ${{ matrix.entry.mode }}
          E2E_USE_SAMPLES: ${{ matrix.entry.samples }}
        run: |
          # Errors are handled explicitly below: the e2e.sh exit code is
          # recorded rather than allowed to abort the step.
          set +e
          chmod +x ./e2e.sh
          if [[ -z "${GH_AW_TEST_PAT:-}" ]]; then
            echo "::error::GH_AW_TEST_PAT secret is not set on this repository. The matrix runs e2e.sh in CI mode, which does not mutate repo secrets and therefore requires GH_AW_TEST_PAT to be supplied via secrets.GH_AW_TEST_PAT."
            exit 1
          fi

          # e2e.sh commits the recompiled workflows and pushes them to main;
          # CI runners have no default git identity.
          git config user.name  "gh-aw-test e2e bot"
          git config user.email "gh-aw-test-e2e@users.noreply.github.com"

          # Slug uniquely identifies this (ref × samples) combination and is used
          # for the per-entry report files.
          slug=$(echo "${E2E_REF}-samples-${E2E_USE_SAMPLES}" | tr '/' '_' | tr -cd 'A-Za-z0-9._-')

          # The matrix is serial (max-parallel: 1), so each entry can safely
          # recompile, push to main, and dispatch every test from main without
          # clobbering another entry. Running from main tests the common case
          # users actually experience (and keeps create-pull-request's
          # fetch-depth:1 merge-base against origin/main trivially resolvable).

          # Tests to run in every matrix entry.
          # Copilot suite in full; one create-issue test for Claude and Codex.
          TESTS=('test-copilot-*' 'test-claude-create-issue' 'test-codex-create-issue')

          # Flags shared by both modes.
          # --verbose streams the gh-aw build/compile/version diagnostics to
          # the job log (otherwise they'd only land in the e2e-test-*.log
          # artifact, making build failures opaque in CI).
          E2E_ARGS=(--verbose)
          if [[ "$E2E_USE_SAMPLES" == "true" ]]; then
            E2E_ARGS+=(--use-samples)
          fi

          if [[ "$E2E_MODE" == "source" ]]; then
            # source mode: locally-built binary + --gh-aw-ref so lockfiles
            # reference github/gh-aw/actions/setup@<ref>.
            # ($CI=true is set automatically by GitHub Actions, so e2e.sh runs
            # in CI mode: no secret mutation. Recompiled workflows are pushed
            # to main.)
            E2E_ARGS=(--gh-aw-ref "$E2E_REF" "${E2E_ARGS[@]}")
          fi
          # extension mode (anything that is not source): rely on the installed
          # `gh aw` (pinned to $E2E_REF), no --gh-aw-ref override — lockfiles
          # will reference the published gh-aw-actions release that ships with
          # that gh-aw version, which is exactly what end users get.

          ./e2e.sh "${E2E_ARGS[@]}" "${TESTS[@]}" 2>&1 | tee e2e-output.log
          # Read e2e.sh's status (not tee's) immediately after the pipeline.
          rc=${PIPESTATUS[0]}
          echo "exit_code=$rc" >> "$GITHUB_OUTPUT"

          # Always produce the per-entry report files (empty when e2e.sh wrote
          # nothing) so the report job can tell "ran, nothing recorded" apart
          # from "artifact missing".
          mkdir -p ../report
          if [[ -s fails.txt ]]; then
            cp fails.txt "../report/fails-${slug}.txt"
          else
            : > "../report/fails-${slug}.txt"
          fi
          if [[ -s passes.txt ]]; then
            cp passes.txt "../report/passes-${slug}.txt"
          else
            : > "../report/passes-${slug}.txt"
          fi
          printf '%s' "$E2E_REF"  > "../report/ref-${slug}.txt"
          printf '%s' "$E2E_MODE" > "../report/mode-${slug}.txt"
          printf '%s' "$rc"       > "../report/rc-${slug}.txt"

          # Don't fail the step here — let the report job decide overall pass/fail
          # so the aggregator always runs and the artifact upload happens.
          exit 0

      - name: Append job summary
        if: always()
        working-directory: gh-aw-test
        env:
          # Matrix values reach the script as environment variables so that
          # dispatch-supplied refs / labels are never parsed as shell syntax.
          E2E_LABEL: ${{ matrix.entry.label }}
          E2E_REF: ${{ matrix.entry.ref }}
          E2E_MODE: ${{ matrix.entry.mode }}
          E2E_USE_SAMPLES: ${{ matrix.entry.samples }}
          E2E_EXIT_CODE: ${{ steps.run.outputs.exit_code }}
        run: |
          {
            echo "## E2E: ${E2E_LABEL} — gh-aw@${E2E_REF} (${E2E_MODE} mode, samples=${E2E_USE_SAMPLES})"
            echo
            echo "Exit code: \`${E2E_EXIT_CODE}\`"
            echo
            if [[ -s fails.txt ]]; then
              echo "### Failed tests"
              echo '```'
              cat fails.txt
              echo '```'
            else
              echo "_No failed tests recorded._"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload artifacts for ${{ matrix.entry.label }} (samples=${{ matrix.entry.samples }})
        if: always()
        uses: actions/upload-artifact@v4
        with:
          # The report job locates this artifact by the same
          # e2e-<label>-samples-<bool> name.
          name: e2e-${{ matrix.entry.label }}-samples-${{ matrix.entry.samples }}
          path: |
            gh-aw-test/e2e-test-*.log
            gh-aw-test/e2e-output.log
            gh-aw-test/fails.txt
            gh-aw-test/passes.txt
            report/

  report:
    name: Publish status report issue
    needs: [resolve-refs, e2e]
    if: always()
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read
    steps:
      - name: Download all matrix artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts

      - name: Build status report body
        env:
          MATRIX_JSON: ${{ needs.resolve-refs.outputs.matrix }}
          MAIN_SHA: ${{ needs.resolve-refs.outputs.main_sha }}
          LATEST_RELEASE: ${{ needs.resolve-refs.outputs.latest_release }}
          LATEST_PRERELEASE: ${{ needs.resolve-refs.outputs.latest_prerelease }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
          SERVER_URL: ${{ github.server_url }}
          REPO: ${{ github.repository }}
          E2E_RESULT: ${{ needs.e2e.result }}
        run: |
          set -euo pipefail

          # ---------------------------------------------------------------
          # Pass 1: gather per-entry data into TSV accumulators so we can
          # render an aggregate summary and a failures-first report.
          # ---------------------------------------------------------------
          entries_tsv=$(mktemp)     # label \t ref \t mode \t samples \t rc \t count \t status
          failed_tests_tsv=$(mktemp)  # test \t label \t ref \t mode \t samples \t run_url
          passed_tests_tsv=$(mktemp)  # test \t label \t ref \t mode \t samples \t run_url

          total_entries=0
          entries_passed=0
          entries_failed=0
          entries_unknown=0
          total_failed_tests=0
          overall_failed=0

          while IFS=$'\t' read -r label ref mode samples; do
            [[ -z "$label" ]] && continue
            total_entries=$((total_entries + 1))
            slug=$(echo "${ref}-samples-${samples}" | tr '/' '_' | tr -cd 'A-Za-z0-9._-')
            art="artifacts/e2e-${label}-samples-${samples}"
            rc_file="${art}/report/rc-${slug}.txt"
            fails_file="${art}/report/fails-${slug}.txt"

            if [[ -f "$rc_file" ]]; then
              rc=$(cat "$rc_file")
            else
              rc="?"
            fi

            count=0
            if [[ -s "$fails_file" ]]; then
              count=$(grep -c . "$fails_file" || true)
              # Collect each failed test + a link to its latest run.
              while IFS= read -r fline; do
                [[ -z "$fline" ]] && continue
                tname="${fline%% *}"
                run_url=""
                if [[ "$fline" == *" "* ]]; then
                  last_id="${fline##* }"
                  run_url="${SERVER_URL}/${REPO}/actions/runs/${last_id}"
                fi
                printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
                  "$tname" "$label" "$ref" "$mode" "$samples" "$run_url" >> "$failed_tests_tsv"
              done < "$fails_file"
            fi

            # Collect each passed test + a link to its run.
            passes_file="${art}/report/passes-${slug}.txt"
            if [[ -s "$passes_file" ]]; then
              while IFS= read -r fline; do
                [[ -z "$fline" ]] && continue
                tname="${fline%% *}"
                run_url=""
                if [[ "$fline" == *" "* ]]; then
                  last_id="${fline##* }"
                  run_url="${SERVER_URL}/${REPO}/actions/runs/${last_id}"
                fi
                printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
                  "$tname" "$label" "$ref" "$mode" "$samples" "$run_url" >> "$passed_tests_tsv"
              done < "$passes_file"
            fi

            # Determine entry status.
            if [[ "$rc" == "?" ]]; then
              status="⚪ unknown"
              entries_unknown=$((entries_unknown + 1))
              overall_failed=1
            elif [[ "$rc" == "0" && "$count" -eq 0 ]]; then
              status="✅ pass"
              entries_passed=$((entries_passed + 1))
            else
              status="❌ fail"
              entries_failed=$((entries_failed + 1))
              overall_failed=1
            fi

            total_failed_tests=$((total_failed_tests + count))
            printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
              "$label" "$ref" "$mode" "$samples" "$rc" "$count" "$status" >> "$entries_tsv"
          done < <(echo "$MATRIX_JSON" | jq -r '.[] | [.label, .ref, .mode, (.samples|tostring)] | @tsv')

          if [[ "$overall_failed" == "0" ]]; then
            headline="✅ All ${total_entries} matrix entr$([[ $total_entries -eq 1 ]] && echo y || echo ies) passed"
          else
            headline="❌ ${entries_failed} of ${total_entries} matrix entr$([[ $total_entries -eq 1 ]] && echo y || echo ies) failed (${total_failed_tests} failed test run(s))"
          fi

          # ---------------------------------------------------------------
          # Build lookup tables for the per-test matrix view.
          # ---------------------------------------------------------------
          declare -A _ref_for_label=()   # label -> ref
          declare -A _cell_status=()     # "samples:test:label" -> "pass" or "fail"
          declare -A _cell_url=()        # "samples:test:label" -> run URL
          while IFS=$'\t' read -r label ref mode samples rc count status; do
            [[ -z "$label" ]] && continue
            _ref_for_label["$label"]="$ref"
          done < "$entries_tsv"
          while IFS=$'\t' read -r tname label ref mode samples run_url; do
            [[ -z "$tname" ]] && continue
            _cell_status["${samples}:${tname}:${label}"]="fail"
            _cell_url["${samples}:${tname}:${label}"]="$run_url"
          done < <(sort -f "$failed_tests_tsv")
          while IFS=$'\t' read -r tname label ref mode samples run_url; do
            [[ -z "$tname" ]] && continue
            _cell_status["${samples}:${tname}:${label}"]="pass"
            _cell_url["${samples}:${tname}:${label}"]="$run_url"
          done < <(sort -f "$passed_tests_tsv")

          # Helper: format a ref value as a markdown link.
          _make_ref_link() {
            local _r="$1"
            if [[ "$_r" == "main" ]]; then
              echo "[main](${SERVER_URL}/github/gh-aw/commit/${MAIN_SHA})"
            else
              echo "[${_r}](${SERVER_URL}/github/gh-aw/releases/tag/${_r})"
            fi
          }

          # Render a test × ref matrix table.
          # Args: samples_value ("true"|"false"), type ("errors"|"successes"), heading
          _render_matrix_table() {
            local _sv="$1"    # "true" or "false"
            local _type="$2"  # "errors" or "successes"
            local _hdg="$3"
            # Ordered columns: release → pre-release → main, then any others.
            local _cols=()
            for _lbl in "release" "pre-release" "main"; do
              [[ -n "${_ref_for_label[$_lbl]:-}" ]] && _cols+=("$_lbl")
            done
            for _lbl in "${!_ref_for_label[@]}"; do
              case "$_lbl" in release|pre-release|main) ;; *)
                _cols+=("$_lbl") ;;
              esac
            done
            [[ "${#_cols[@]}" -eq 0 ]] && return 0
            local _tests
            if [[ "$_type" == "errors" ]]; then
              _tests=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$failed_tests_tsv" | sort -u)
            else
              local _fp _ff
              _fp=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$passed_tests_tsv" | sort -u)
              _ff=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$failed_tests_tsv" | sort -u)
              if [[ -z "$_fp" ]]; then
                _tests=""
              elif [[ -z "$_ff" ]]; then
                _tests="$_fp"
              else
                _tests=$(comm -23 <(echo "$_fp") <(echo "$_ff"))
              fi
            fi
            [[ -z "$_tests" ]] && return 0
            echo "## ${_hdg}"
            echo
            local _hdr="| test |"
            local _sep="| :--- |"
            for _lbl in "${_cols[@]}"; do
              local _ref="${_ref_for_label[$_lbl]}"
              local _rlink
              _rlink=$(_make_ref_link "$_ref")
              _hdr+=" ${_rlink} |"
              _sep+=" :---: |"
            done
            echo "$_hdr"
            echo "$_sep"
            while IFS= read -r _tname; do
              [[ -z "$_tname" ]] && continue
              local _test_url="${SERVER_URL}/${REPO}/blob/main/.github/workflows/${_tname}.md"
              local _row="| [${_tname}](${_test_url}) |"
              for _lbl in "${_cols[@]}"; do
                local _key="${_sv}:${_tname}:${_lbl}"
                local _st="${_cell_status[$_key]:-}"
                local _ru="${_cell_url[$_key]:-}"
                local _cell
                if [[ "$_st" == "pass" ]]; then
                  [[ -n "$_ru" ]] && _cell="[✅]($_ru)" || _cell="✅"
                elif [[ "$_st" == "fail" ]]; then
                  [[ -n "$_ru" ]] && _cell="[❌]($_ru)" || _cell="❌"
                else
                  _cell="—"
                fi
                _row+=" ${_cell} |"
              done
              echo "$_row"
            done <<< "$_tests"
            echo
          }

          # ---------------------------------------------------------------
          # Pass 2: render the markdown report.
          #
          # Everything the group below writes to stdout is captured in
          # report.md: a one-line run summary, four matrix tables
          # (errors/successes for samples=true and samples=false) and, when
          # overall_failed is not "0", a copy-paste triage prompt for a
          # coding agent.
          # ---------------------------------------------------------------
          {
            echo "**Run:** [$RUN_URL]($RUN_URL) &nbsp;·&nbsp; **Trigger:** \`${{ github.event_name }}\` &nbsp;·&nbsp; **Generated:** \`$(date -u +%FT%TZ)\` &nbsp;·&nbsp; **Outcome:** \`$E2E_RESULT\`"
            echo

            # Arguments as passed here: samples flag, result kind, section
            # heading (presumably matching the function's parameters — confirm
            # against its definition earlier in this script).
            _render_matrix_table "true"  "errors"    "❌ Test errors — samples mode"
            _render_matrix_table "false" "errors"    "❌ Test errors — inference mode"
            _render_matrix_table "true"  "successes" "✅ Test successes — samples mode"
            _render_matrix_table "false" "successes" "✅ Test successes — inference mode"

            # When anything failed, append a ready-to-paste prompt for a coding
            # agent to triage the failures from the run logs.
            if [[ "$overall_failed" != "0" ]]; then
              echo "## 🤖 Agent triage prompt"
              echo
              echo "Copy the block below into a coding agent (e.g. Copilot) to investigate the failures:"
              echo
              echo '```text'
              echo "You are triaging failures from the gh-aw-test E2E suite."
              echo "Run: $RUN_URL"
              echo "Repository under test: github/gh-aw (the gh-aw CLI/compiler)."
              echo "Test harness repository: $GITHUB_REPOSITORY (this repo; runner is e2e.sh)."
              echo
              echo "Focus: gh-aw main (commit ${MAIN_SHA}), sampling = true (--use-samples; AI engine was NOT called)."
              echo
              # List every test that failed in the main/samples=true cell, with its run URL.
              # TSV columns as used by the filters below: $1 is printed as the
              # test name, $2 is compared with "main", $5 with "true" and $6 is
              # printed as the run URL (layout inferred from this usage — confirm
              # against the code that writes the file).
              main_samples_failures=$(awk -F'\t' '$2=="main" && $5=="true" {print "  - " $1 "  run: " $6}' "$failed_tests_tsv")
              if [[ -n "$main_samples_failures" ]]; then
                echo "Failed tests (main, samples=true):"
                echo "$main_samples_failures"
              else
                echo "No failures in the main/samples=true cell."
              fi
              # Also summarise failures in other cells if present, so that the
              # "EACH failed test above" instruction covers them as well.
              other_failures=$(awk -F'\t' '!($2=="main" && $5=="true") {print "  - " $1 "  [" $2 ", samples=" $5 "]  run: " $6}' "$failed_tests_tsv")
              if [[ -n "$other_failures" ]]; then
                echo
                echo "Failed tests in other cells:"
                echo "$other_failures"
              fi
              echo
              echo "Goal: for EACH failed test above, access the GitHub"
              echo "Actions logs for the run above (and the per-entry artifacts e2e-<label>-samples-<bool>,"
              echo "which contain e2e-test-*.log, e2e-output.log and fails.txt), determine the root"
              echo "cause, and categorize the failure as exactly one of:"
              echo
              echo "  1. TRANSIENT — flaky/infra/network/rate-limit/timing; not a real defect."
              echo "     Action: note it and recommend a re-run (./e2e.sh rerun)."
              echo "  2. TEST-FRAMEWORK BUG — a defect in this repo's harness (e2e.sh), a workflow"
              echo "     source file (.github/workflows/test-*.md), a sample, or CI config."
              echo "     Action: propose a concrete fix (file + change) in $GITHUB_REPOSITORY."
              echo "  3. GH-AW BUG — a defect in github/gh-aw itself (compiler output, runtime"
              echo "     engine behaviour, safe-output handling, etc.)."
              echo "     Action: open an issue in github/gh-aw with a minimal repro, the failing"
              echo "     test name, the gh-aw ref/mode/samples combination, and links to the"
              echo "     relevant log lines. Check for an existing open issue first and link it"
              echo "     instead of filing a duplicate."
              echo
              echo "Steps:"
              echo "  - Use 'gh run view <run-id> --log' and 'gh run download <run-id>' to fetch logs/artifacts."
              echo "  - Read AGENTS.md in $GITHUB_REPOSITORY for harness conventions before proposing fixes."
              echo "  - Group failures by suspected root cause; the same gh-aw bug may explain several."
              echo "  - Produce a table: test | category | root cause | recommended action | issue/PR link."
              echo "  - Only open github/gh-aw issues for category 3, and only after confirming no duplicate exists."
              echo '```'
            fi

            # This line has its own redirect, so the flag lands in
            # /tmp/overall_failed (read back by the final step of this job)
            # rather than in report.md.
            echo "$overall_failed" > /tmp/overall_failed
          } > report.md

          # Remove the intermediate TSV files and echo the finished report to
          # the step log.
          rm -f "$entries_tsv" "$failed_tests_tsv" "$passed_tests_tsv"
          cat report.md

      # Best-effort creation of the label attached to status-report issues.
      # gh's stderr is discarded and a non-zero exit is ignored, so this step
      # never fails (presumably to tolerate the label already existing).
      - name: Ensure status-report label exists
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
        run: |
          gh label create e2e-status-report \
            --repo "$GITHUB_REPOSITORY" \
            --description "Automated cross-ref E2E status report" \
            --color BFD4F2 \
            2>/dev/null || :

      # Look up the open issues carrying the e2e-status-report label and expose
      # their numbers as the comma-separated step output `numbers` (empty when
      # there are none).
      - name: Find prior open status reports
        id: prior
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
        run: |
          set -euo pipefail
          # At most 50 issues are requested; jq flattens them to "n1,n2,...".
          open_reports="$(
            gh issue list \
              --repo "$GITHUB_REPOSITORY" \
              --state open \
              --label e2e-status-report \
              --limit 50 \
              --json number \
              --jq 'map(.number) | join(",")'
          )"
          printf 'numbers=%s\n' "$open_reports" >> "$GITHUB_OUTPUT"
          printf 'Prior open status reports: %s\n' "${open_reports:-none}"

      # Build issue-body.md: the rendered report, followed (only when prior
      # reports were found) by a bullet list referencing each of them.
      - name: Compose final issue body (with links to previous reports)
        env:
          PRIOR: ${{ steps.prior.outputs.numbers }}
        run: |
          cat report.md > issue-body.md
          # Nothing to append when no prior reports were found.
          [[ -n "$PRIOR" ]] || exit 0
          # PRIOR is a comma-separated list of issue numbers.
          IFS=',' read -ra prior_issues <<< "$PRIOR"
          {
            printf '\n%s\n' "## Previous status reports (closed by this run)"
            for issue in "${prior_issues[@]}"; do
              printf '* #%s\n' "$issue"
            done
          } >> issue-body.md

      # Open the new status-report issue from issue-body.md and publish its
      # number and URL as step outputs.
      - name: Create new status report issue
        id: create
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
        run: |
          set -euo pipefail
          report_date="$(date -u +%F)"
          # The command's stdout is captured and treated as the issue URL.
          issue_url="$(
            gh issue create \
              --repo "$GITHUB_REPOSITORY" \
              --label e2e-status-report \
              --title "E2E status report — ${report_date} (gh-aw matrix)" \
              --body-file issue-body.md
          )"
          echo "Created: $issue_url"
          {
            # The issue number is whatever follows the last "/" of the URL.
            echo "number=${issue_url##*/}"
            echo "url=$issue_url"
          } >> "$GITHUB_OUTPUT"

      # For every previously open report: leave a comment pointing at the new
      # issue, then close it. Skipped when no prior reports were found.
      - name: Close previous status report issues
        if: steps.prior.outputs.numbers != ''
        env:
          GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
          PRIOR: ${{ steps.prior.outputs.numbers }}
          NEW_NUMBER: ${{ steps.create.outputs.number }}
          NEW_URL: ${{ steps.create.outputs.url }}
        run: |
          superseded_note="Superseded by #${NEW_NUMBER} — ${NEW_URL}"
          # PRIOR is a comma-separated list of issue numbers.
          IFS=',' read -ra stale_reports <<< "$PRIOR"
          for issue in "${stale_reports[@]}"; do
            gh issue comment "$issue" --repo "$GITHUB_REPOSITORY" --body "$superseded_note"
            gh issue close "$issue" --repo "$GITHUB_REPOSITORY" --reason "not planned"
          done

      # Turn the failure flag stored in /tmp/overall_failed into the job result.
      # A missing or unreadable flag file is treated as a failure.
      - name: Fail the workflow if any matrix entry reported failures
        run: |
          failed_flag="$(cat /tmp/overall_failed 2>/dev/null || echo 1)"
          # Only a literal "0" counts as success.
          if [[ "$failed_flag" == "0" ]]; then
            exit 0
          fi
          echo "::error::At least one gh-aw matrix entry reported failures (see status report issue)"
          exit 1