Skip to content

e2e

e2e #302

Workflow file for this run

name: e2e
# Compile-compatibility matrix that runs the E2E suite against multiple gh-aw
# refs × samples modes. The scheduled (nightly) run exercises the full matrix:
# 1. github/gh-aw `main` (source mode)
# 2. the latest gh-aw pre-release tag (extension mode)
# 3. the latest gh-aw stable release (extension mode)
# each crossed with samples=false (live engine) and samples=true (deterministic).
#
# Execution model
# ---------------
# * SERIAL: the matrix runs with `max-parallel: 1`. Each entry itself dispatches
# dozens of test workflows, so running entries concurrently would flood GitHub
# Actions. Entries are therefore queued one at a time (`fail-fast: false` so a
# single entry's failure doesn't cancel the rest).
# * RUNS FROM MAIN: each entry recompiles its workflows, pushes them to `main`,
# and dispatches every test from `main`. Event-triggered tests (add-comment,
# add-labels, update-issue, etc.) create their own fixtures and wait for the
# GitHub event to fire, exactly as they do when run locally.
#
# Manual dispatch
# ---------------
# `workflow_dispatch` exposes a `selection` choice so a human can run a single
# combination quickly instead of the whole matrix. The default is `quick`
# (main / source / samples=true). `full` runs the entire matrix serially, same
# as the nightly schedule.
#
# Because `$CI` is set to `true` by GitHub Actions, `e2e.sh` runs in CI mode: it
# does NOT mutate the repository's `TEMP_USER_PAT` secret (it relies on the
# pre-configured repo secret + the `GH_AW_TEST_PAT` env var instead).
#
# Required secrets:
# * GH_AW_TEST_PAT (required) — PAT used for all operations except copilot requests
# Triggers: the nightly schedule, plus a manual dispatch that can narrow the run
# to a single matrix combination or point it at specific gh-aw refs.
on:
  schedule:
    # Nightly at 03:00 UTC
    - cron: "0 3 * * *"
  workflow_dispatch:
    inputs:
      selection:
        type: choice
        required: false
        default: quick
        description: "Which part of the matrix to run"
        options:
          - quick # main / source / samples=true (fast, deterministic)
          - main-live # main / source / samples=false (live engine path)
          - release # latest release / extension / samples=true
          - prerelease # latest pre-release / extension / samples=true
          - full # full matrix, run serially
      gh_aw_ref:
        required: false
        default: ""
        description: >-
          github/gh-aw branch, tag, or SHA to test in source mode (e.g. main, a
          feature branch, or a commit SHA). Applies to the quick / main-live
          selections; defaults to main.
      refs:
        required: false
        default: ""
        description: >-
          Optional comma-separated list of gh-aw refs to test (overrides main /
          latest pre-release / latest release auto-detection). Only used when
          selection=full.
# All runs of this workflow share one concurrency group; starting a new run
# cancels the run that is currently in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}
# Default GITHUB_TOKEN scopes for every job. Pushes to main are made with the
# GH_AW_TEST_PAT secret instead, so `contents` stays read-only here.
permissions:
  actions: read
  contents: read
  issues: write
  pull-requests: read
jobs:
  # Works out which gh-aw refs to test and publishes the resulting matrix
  # (plus the resolved SHA / tags) as job outputs for the downstream jobs.
  resolve-refs:
    runs-on: ubuntu-latest
    name: Resolve gh-aw refs to test
    outputs:
      latest_prerelease: ${{ steps.resolve.outputs.latest_prerelease }}
      latest_release: ${{ steps.resolve.outputs.latest_release }}
      main_sha: ${{ steps.resolve.outputs.main_sha }}
      matrix: ${{ steps.resolve.outputs.matrix }}
    steps:
# Resolves the gh-aw refs under test and emits the job matrix.
# Outputs (written to $GITHUB_OUTPUT): main_sha, latest_release,
# latest_prerelease, and matrix (a compact JSON array of
# { label, ref, mode, samples } objects).
- name: Resolve refs
  id: resolve
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
    # Optional comma-separated ref list; only consulted by the `full` selection.
    OVERRIDE: ${{ inputs.refs }}
    # Optional github/gh-aw branch/tag/SHA to test in source mode. Applies
    # to the single-entry quick / main-live selections; empty => main.
    GH_AW_REF_INPUT: ${{ inputs.gh_aw_ref }}
    # On `schedule` (and any non-dispatch event) there is no selection
    # input, so default to the full matrix. On workflow_dispatch this is
    # the user's chosen option (default `quick`).
    SELECTION: ${{ github.event_name == 'workflow_dispatch' && inputs.selection || 'full' }}
  run: |
    set -euo pipefail

    # Turn a git ref into a label that is safe to use in job titles and
    # artifact names: '/' becomes '-', anything outside [A-Za-z0-9._-] is
    # dropped.
    slugify() {
      printf '%s' "$1" | tr '/' '-' | tr -cd 'A-Za-z0-9._-'
    }

    main_sha=$(gh api repos/github/gh-aw/commits/main --jq '.sha' | cut -c1-12)

    # `// empty` makes a missing match produce no output at all, so the
    # emptiness checks below do not depend on how a null is printed.
    latest_release=$(gh release list --repo github/gh-aw --limit 50 \
      --json tagName,isPrerelease,isDraft \
      --jq '[.[] | select(.isPrerelease==false and .isDraft==false)][0].tagName // empty' || echo "")
    latest_prerelease=$(gh release list --repo github/gh-aw --limit 50 \
      --json tagName,isPrerelease,isDraft \
      --jq '[.[] | select(.isPrerelease==true and .isDraft==false)][0].tagName // empty' || echo "")

    if [[ -z "$latest_release" ]]; then
      echo "::error::Could not resolve latest gh-aw release"
      exit 1
    fi
    # No pre-release published: fall back to the stable release (the `full`
    # matrix de-dupes by ref, so it is not run twice).
    if [[ -z "$latest_prerelease" ]]; then
      latest_prerelease="$latest_release"
    fi

    echo "Selection: $SELECTION"

    # The single-entry source-mode selections (quick / main-live) test a
    # specific github/gh-aw ref. Default to `main`, but allow the dispatch
    # input `gh_aw_ref` to point them at any branch, tag, or SHA. The label
    # is slugified from the ref so job titles / artifact names stay valid.
    src_ref="${GH_AW_REF_INPUT:-main}"
    src_label=$(slugify "$src_ref")
    echo "Source-mode ref: $src_ref (label: $src_label)"

    # Each matrix entry is a single, fully-specified combination:
    #   label   : short human-readable name used in job titles, artifact
    #             names and the report
    #   ref     : git ref / tag passed to the build or install step
    #   mode    : 'source'    => check out github/gh-aw at <ref>, `make build`,
    #                            then run e2e.sh --gh-aw-ref <ref>.
    #             'extension' => `gh extension install github/gh-aw --pin <ref>`,
    #                            then run e2e.sh with NO --gh-aw-ref.
    #   samples : true  => run with --use-samples (deterministic, no engine)
    #             false => run against the live AI engine
    #
    # The job that consumes this matrix runs serially (max-parallel: 1), so
    # entries never race each other when they push to main.
    #
    # `full` expands every base ref (only the entry labelled `main` gets both
    # samples modes). The single-entry selections pick exactly one
    # combination for a fast manual dispatch.
    case "$SELECTION" in
      quick)
        matrix=$(jq -nc --arg ref "$src_ref" --arg label "$src_label" \
          '[{ label: $label, ref: $ref, mode: "source", samples: true }]')
        ;;
      main-live)
        matrix=$(jq -nc --arg ref "$src_ref" --arg label "$src_label" \
          '[{ label: $label, ref: $ref, mode: "source", samples: false }]')
        ;;
      release)
        matrix=$(jq -nc --arg rel "$latest_release" \
          '[{ label: "release", ref: $rel, mode: "extension", samples: true }]')
        ;;
      prerelease)
        matrix=$(jq -nc --arg pre "$latest_prerelease" \
          '[{ label: "pre-release", ref: $pre, mode: "extension", samples: true }]')
        ;;
      full|*)
        # Override format (workflow_dispatch input `refs`): comma-separated
        # tokens. Each token may be a bare ref (mode=source) or
        # "<ref>:source" / "<ref>:extension" to force a mode.
        # The label is slugified exactly like the shell `slugify` helper,
        # because it ends up in the artifact name and a raw ref such as
        # `feature/x` would contain characters that are not allowed there.
        if [[ -n "$OVERRIDE" ]]; then
          base=$(echo "$OVERRIDE" | jq -Rc '
            def slug: gsub("/"; "-") | gsub("[^A-Za-z0-9._-]"; "");
            split(",")
            | map(gsub("^\\s+|\\s+$";""))
            | map(select(length>0))
            | map(
                split(":") as $parts
                | { label: ($parts[0] | slug), ref: $parts[0], mode: ($parts[1] // "source") }
              )')
        else
          base=$(jq -nc \
            --arg pre "$latest_prerelease" \
            --arg rel "$latest_release" \
            '[
              { label: "main", ref: "main", mode: "source" },
              { label: "pre-release", ref: $pre, mode: "extension" },
              { label: "release", ref: $rel, mode: "extension" }
            ]
            # de-dupe by ref so pre-release==release does not run twice
            | unique_by(.ref)')
        fi
        # Expand each base entry across samples modes:
        #   main                → samples=false AND samples=true (inference + deterministic)
        #   pre-release/release → samples=true only (skip inference runs for speed)
        matrix=$(echo "$base" | jq -c '[ .[] as $e | if $e.label == "main" then (false, true) else true end | $e + { samples: . } ]')
        ;;
    esac

    echo "Resolved:"
    echo " main: main ($main_sha)"
    echo " latest pre-release: $latest_prerelease"
    echo " latest release: $latest_release"
    echo " matrix: $matrix"
    {
      echo "main_sha=$main_sha"
      echo "latest_release=$latest_release"
      echo "latest_prerelease=$latest_prerelease"
      echo "matrix=$matrix"
    } >> "$GITHUB_OUTPUT"
# Runs the E2E suite once per matrix entry produced by resolve-refs.
e2e:
  needs: resolve-refs
  runs-on: ubuntu-latest
  timeout-minutes: 90
  name: "E2E: ${{ matrix.entry.label }} (gh-aw@${{ matrix.entry.ref }}, ${{ matrix.entry.mode }}, samples=${{ matrix.entry.samples }})"
  strategy:
    matrix:
      entry: ${{ fromJSON(needs.resolve-refs.outputs.matrix) }}
    # One entry at a time: every entry dispatches many test workflows of its
    # own, so running entries in parallel would flood GitHub Actions with
    # concurrent runs.
    max-parallel: 1
    # A failing entry must not cancel the entries still waiting in the
    # (serial) queue.
    fail-fast: false
  steps:
- name: Checkout gh-aw-test
  uses: actions/checkout@v4
  with:
    path: gh-aw-test
    fetch-depth: 0
    # Check out the current tip of main rather than the triggering SHA
    # (`github.sha`). The matrix is serial, so by the time this entry starts
    # an earlier entry may already have pushed a recompile commit on top of
    # that SHA; a checkout of the older commit would sit behind origin/main
    # and the later `git push` would be rejected as non-fast-forward.
    ref: main
    # The PAT lets e2e.sh push the recompiled workflows to main; the default
    # GITHUB_TOKEN only has contents:read in this workflow.
    token: ${{ secrets.GH_AW_TEST_PAT }}
# --- source mode only: gh-aw is built from a checkout of the requested ref ---
- name: Checkout github/gh-aw at ${{ matrix.entry.ref }} (source mode)
  if: ${{ matrix.entry.mode == 'source' }}
  uses: actions/checkout@v4
  with:
    path: gh-aw
    repository: github/gh-aw
    ref: ${{ matrix.entry.ref }}
    fetch-depth: 1
# The Go version and the dependency cache key both come from the gh-aw checkout.
- name: Set up Go (source mode)
  if: ${{ matrix.entry.mode == 'source' }}
  uses: actions/setup-go@v5
  with:
    cache-dependency-path: gh-aw/go.sum
    go-version-file: gh-aw/go.mod
# Builds gh-aw inside the `gh-aw` checkout created by the earlier step.
- name: Build gh-aw binary from source (source mode)
  if: ${{ matrix.entry.mode == 'source' }}
  run: make build
  working-directory: gh-aw
# --- extension mode: install published gh extension at the pinned tag ---
- name: Install published gh-aw extension at ${{ matrix.entry.ref }} (extension mode)
  if: matrix.entry.mode == 'extension'
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
    # The ref can come from the free-form `refs` dispatch input, so it is
    # handed to the script as an environment variable instead of being
    # interpolated into the script text (which would allow shell injection).
    E2E_REF: ${{ matrix.entry.ref }}
  run: |
    set -euo pipefail
    # Remove any pre-existing install so --pin actually wins
    gh extension remove github/gh-aw 2>/dev/null || true
    # Retry up to 3 times with backoff. Prior matrix entries (especially
    # the 45-minute source-mode run) can exhaust the API rate limit,
    # causing a transient 403 on the release-asset download.
    for _attempt in 1 2 3; do
      if gh extension install github/gh-aw --pin "$E2E_REF"; then
        break
      fi
      if [[ "$_attempt" -eq 3 ]]; then
        echo "::error::Failed to install gh-aw extension after 3 attempts"
        exit 1
      fi
      echo "Install attempt $_attempt failed; waiting 90s for rate limit to recover..."
      sleep 90
    done
    # Log the installed version so the job log shows what is under test.
    gh aw --version
# --- steps below run for every matrix entry, regardless of mode ---
- uses: actions/setup-node@v5
  name: Set up Node.js
# Prints the gh CLI authentication state for the PAT into the job log.
- name: gh auth status
  run: gh auth status
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
# Runs e2e.sh for this matrix entry and records its result under ../report/
# (fails-, passes-, ref-, mode- and rc-<slug>.txt). The step itself always
# exits 0 once e2e.sh has run; the real exit code is exposed as the
# `exit_code` step output and in rc-<slug>.txt for the report job.
- name: Run E2E against gh-aw@${{ matrix.entry.ref }} (${{ matrix.entry.mode }} mode)
  id: run
  working-directory: gh-aw-test
  timeout-minutes: 80
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
    GITHUB_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
    GH_AW_TEST_PAT: ${{ secrets.GH_AW_TEST_PAT }}
    E2E_REF: ${{ matrix.entry.ref }}
    E2E_MODE: ${{ matrix.entry.mode }}
    E2E_USE_SAMPLES: ${{ matrix.entry.samples }}
  run: |
    # Errors are handled explicitly below; a failing e2e.sh must not abort
    # the step before the report files are written.
    set +e
    chmod +x ./e2e.sh
    if [[ -z "${GH_AW_TEST_PAT:-}" ]]; then
      echo "::error::GH_AW_TEST_PAT secret is not set on this repository. The matrix runs e2e.sh in CI mode, which does not mutate repo secrets and therefore requires GH_AW_TEST_PAT to be supplied via secrets.GH_AW_TEST_PAT."
      exit 1
    fi

    # e2e.sh commits the recompiled workflows and pushes them to main;
    # CI runners have no default git identity.
    git config user.name "gh-aw-test e2e bot"
    git config user.email "gh-aw-test-e2e@users.noreply.github.com"

    # Slug uniquely identifies this (ref × samples) combination and is used
    # for the per-entry report files.
    slug=$(echo "${E2E_REF}-samples-${E2E_USE_SAMPLES}" | tr '/' '_' | tr -cd 'A-Za-z0-9._-')

    # The matrix is serial (max-parallel: 1), so each entry can safely
    # recompile, push to main, and dispatch every test from main without
    # clobbering another entry. Running from main tests the common case
    # users actually experience (and keeps create-pull-request's
    # fetch-depth:1 merge-base against origin/main trivially resolvable).

    # Build the e2e.sh argument list once; an array keeps every argument
    # intact without relying on unquoted word splitting.
    E2E_ARGS=()
    if [[ "$E2E_MODE" == "source" ]]; then
      # source mode: locally-built binary + --gh-aw-ref so lockfiles
      # reference github/gh-aw/actions/setup@<ref>.
      # ($CI=true is set automatically by GitHub Actions, so e2e.sh runs
      # in CI mode: no secret mutation. Recompiled workflows are pushed
      # to main.)
      E2E_ARGS+=(--gh-aw-ref "$E2E_REF")
    fi
    # extension mode adds nothing here: it relies on the installed `gh aw`
    # (pinned to $E2E_REF) with no --gh-aw-ref override — lockfiles will
    # reference the published gh-aw-actions release that ships with that
    # gh-aw version, which is exactly what end users get.

    # --verbose streams the gh-aw build/compile/version diagnostics to
    # the job log (otherwise they'd only land in the e2e-test-*.log
    # artifact, making build failures opaque in CI).
    E2E_ARGS+=(--verbose --batch-size 7)
    if [[ "$E2E_USE_SAMPLES" == "true" ]]; then
      E2E_ARGS+=(--use-samples)
    fi

    # Tests to run in every matrix entry.
    # Copilot suite in full; one create-issue test for Claude and Codex.
    TESTS=('test-copilot-*' 'test-claude-create-issue' 'test-codex-create-issue')

    ./e2e.sh "${E2E_ARGS[@]}" "${TESTS[@]}" 2>&1 | tee e2e-output.log
    # Read e2e.sh's status (not tee's) immediately after the pipeline, before
    # any other command has a chance to overwrite PIPESTATUS.
    rc=${PIPESTATUS[0]}
    echo "exit_code=$rc" >> "$GITHUB_OUTPUT"

    # Always produce the per-entry report files, empty when e2e.sh wrote no
    # fails.txt / passes.txt, so the report job finds a complete set.
    mkdir -p ../report
    if [[ -s fails.txt ]]; then
      cp fails.txt "../report/fails-${slug}.txt"
    else
      : > "../report/fails-${slug}.txt"
    fi
    if [[ -s passes.txt ]]; then
      cp passes.txt "../report/passes-${slug}.txt"
    else
      : > "../report/passes-${slug}.txt"
    fi
    printf '%s' "$E2E_REF" > "../report/ref-${slug}.txt"
    printf '%s' "$E2E_MODE" > "../report/mode-${slug}.txt"
    printf '%s' "$rc" > "../report/rc-${slug}.txt"

    # Don't fail the step here — let the report job decide overall pass/fail
    # so the aggregator always runs and the artifact upload happens.
    exit 0
# Appends a short per-entry section (exit code + failed tests, if any) to the
# job summary page. Runs even when an earlier step failed.
- name: Append job summary
  if: always()
  working-directory: gh-aw-test
  env:
    # Matrix values are passed through the environment rather than being
    # interpolated into the script text: label/ref can originate from
    # free-form dispatch inputs, and inline interpolation would let shell
    # metacharacters in them execute.
    E2E_LABEL: ${{ matrix.entry.label }}
    E2E_REF: ${{ matrix.entry.ref }}
    E2E_MODE: ${{ matrix.entry.mode }}
    E2E_USE_SAMPLES: ${{ matrix.entry.samples }}
    E2E_EXIT_CODE: ${{ steps.run.outputs.exit_code }}
  run: |
    {
      echo "## E2E: ${E2E_LABEL} — gh-aw@${E2E_REF} (${E2E_MODE} mode, samples=${E2E_USE_SAMPLES})"
      echo
      echo "Exit code: \`${E2E_EXIT_CODE}\`"
      echo
      if [[ -s fails.txt ]]; then
        echo "### Failed tests"
        echo '```'
        cat fails.txt
        echo '```'
      else
        echo "_No failed tests recorded._"
      fi
    } >> "$GITHUB_STEP_SUMMARY"
# Publishes the logs and the per-entry report files for the report job. The
# artifact name is built from the entry's label and samples flag.
- name: Upload artifacts for ${{ matrix.entry.label }} (samples=${{ matrix.entry.samples }})
  uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    path: |
      gh-aw-test/e2e-test-*.log
      gh-aw-test/e2e-output.log
      gh-aw-test/fails.txt
      gh-aw-test/passes.txt
      report/
    name: e2e-${{ matrix.entry.label }}-samples-${{ matrix.entry.samples }}
# Aggregates the per-entry artifacts into one status-report issue. Runs even
# when earlier jobs failed, so a report is always published.
report:
  name: Publish status report issue
  runs-on: ubuntu-latest
  if: ${{ always() }}
  needs:
    - resolve-refs
    - e2e
  permissions:
    contents: read
    issues: write
  steps:
# No artifact name is given, so every artifact of this run is fetched; the
# report script then looks for each one under artifacts/<artifact-name>/.
- uses: actions/download-artifact@v4
  name: Download all matrix artifacts
  with:
    path: artifacts
# Builds report.md (the status-report issue body) from the downloaded
# per-entry artifacts and writes the overall verdict ("0" = everything
# passed) to /tmp/overall_failed for the final step of this job.
- name: Build status report body
  env:
    MATRIX_JSON: ${{ needs.resolve-refs.outputs.matrix }}
    MAIN_SHA: ${{ needs.resolve-refs.outputs.main_sha }}
    LATEST_RELEASE: ${{ needs.resolve-refs.outputs.latest_release }}
    LATEST_PRERELEASE: ${{ needs.resolve-refs.outputs.latest_prerelease }}
    RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
    SERVER_URL: ${{ github.server_url }}
    REPO: ${{ github.repository }}
    E2E_RESULT: ${{ needs.e2e.result }}
    EVENT_NAME: ${{ github.event_name }}
  run: |
    set -euo pipefail
    # ---------------------------------------------------------------
    # Pass 1: gather per-entry data into TSV accumulators so we can
    # render an aggregate summary and a failures-first report.
    # ---------------------------------------------------------------
    entries_tsv=$(mktemp)      # label \t ref \t mode \t samples \t rc \t count \t status
    failed_tests_tsv=$(mktemp) # test \t label \t ref \t mode \t samples \t run_url
    passed_tests_tsv=$(mktemp) # test \t label \t ref \t mode \t samples \t run_url
    total_entries=0
    entries_passed=0
    entries_failed=0
    entries_unknown=0
    total_failed_tests=0
    overall_failed=0

    # Helper: append one TSV row per non-empty line of a fails/passes file.
    # Args: file, output TSV, label, ref, mode, samples.
    # The test name is the text before the first space; when the line
    # contains a space, the text after the last space is used as the
    # Actions run id for the link.
    _collect_tests() {
      local _file="$1" _out="$2" _label="$3" _ref="$4" _mode="$5" _samples="$6"
      local _line _tname _url
      [[ -s "$_file" ]] || return 0
      while IFS= read -r _line; do
        [[ -z "$_line" ]] && continue
        _tname="${_line%% *}"
        _url=""
        if [[ "$_line" == *" "* ]]; then
          _url="${SERVER_URL}/${REPO}/actions/runs/${_line##* }"
        fi
        printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
          "$_tname" "$_label" "$_ref" "$_mode" "$_samples" "$_url" >> "$_out"
      done < "$_file"
    }

    while IFS=$'\t' read -r label ref mode samples; do
      [[ -z "$label" ]] && continue
      total_entries=$((total_entries + 1))
      # Same slug / artifact-name derivation as the e2e job uses when it
      # writes and uploads the report files.
      slug=$(echo "${ref}-samples-${samples}" | tr '/' '_' | tr -cd 'A-Za-z0-9._-')
      art="artifacts/e2e-${label}-samples-${samples}"
      rc_file="${art}/report/rc-${slug}.txt"
      fails_file="${art}/report/fails-${slug}.txt"
      passes_file="${art}/report/passes-${slug}.txt"

      # "?" marks an entry whose rc file is missing (e.g. the entry never
      # reached the point of writing its report files).
      if [[ -f "$rc_file" ]]; then
        rc=$(cat "$rc_file")
      else
        rc="?"
      fi

      count=0
      if [[ -s "$fails_file" ]]; then
        count=$(grep -c . "$fails_file" || true)
      fi
      _collect_tests "$fails_file" "$failed_tests_tsv" "$label" "$ref" "$mode" "$samples"
      _collect_tests "$passes_file" "$passed_tests_tsv" "$label" "$ref" "$mode" "$samples"

      # Determine entry status.
      if [[ "$rc" == "?" ]]; then
        status="⚪ unknown"
        entries_unknown=$((entries_unknown + 1))
        overall_failed=1
      elif [[ "$rc" == "0" && "$count" -eq 0 ]]; then
        status="✅ pass"
        entries_passed=$((entries_passed + 1))
      else
        status="❌ fail"
        entries_failed=$((entries_failed + 1))
        overall_failed=1
      fi
      total_failed_tests=$((total_failed_tests + count))
      printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
        "$label" "$ref" "$mode" "$samples" "$rc" "$count" "$status" >> "$entries_tsv"
    done < <(echo "$MATRIX_JSON" | jq -r '.[] | [.label, .ref, .mode, (.samples|tostring)] | @tsv')

    entry_word="entries"
    if [[ "$total_entries" -eq 1 ]]; then
      entry_word="entry"
    fi
    if [[ "$total_entries" -eq 0 ]]; then
      # An empty or unreadable matrix means nothing was tested at all; that
      # must never be reported as a success.
      overall_failed=1
      headline="❌ No matrix entries found (the matrix from resolve-refs was empty or unreadable)"
    elif [[ "$overall_failed" == "0" ]]; then
      headline="✅ All ${total_entries} matrix ${entry_word} passed"
    else
      headline="❌ $((entries_failed + entries_unknown)) of ${total_entries} matrix ${entry_word} did not pass (${entries_failed} failed, ${entries_unknown} unknown; ${total_failed_tests} failed test run(s))"
    fi
    # Persist the verdict for the final step of this job.
    echo "$overall_failed" > /tmp/overall_failed

    # ---------------------------------------------------------------
    # Build lookup tables for the per-test matrix view.
    # ---------------------------------------------------------------
    declare -A _ref_for_label=()  # label -> ref
    declare -A _mode_for_label=() # label -> mode
    declare -A _cell_status=()    # "samples:test:label" -> "pass" or "fail"
    declare -A _cell_url=()       # "samples:test:label" -> run URL
    while IFS=$'\t' read -r label ref mode samples rc count status; do
      [[ -z "$label" ]] && continue
      _ref_for_label["$label"]="$ref"
      _mode_for_label["$label"]="$mode"
    done < "$entries_tsv"
    # Failures are loaded first and passes second, so when a test is listed
    # in both files for the same cell the pass entry is the one kept.
    while IFS=$'\t' read -r tname label ref mode samples run_url; do
      [[ -z "$tname" ]] && continue
      _cell_status["${samples}:${tname}:${label}"]="fail"
      _cell_url["${samples}:${tname}:${label}"]="$run_url"
    done < <(sort -f "$failed_tests_tsv")
    while IFS=$'\t' read -r tname label ref mode samples run_url; do
      [[ -z "$tname" ]] && continue
      _cell_status["${samples}:${tname}:${label}"]="pass"
      _cell_url["${samples}:${tname}:${label}"]="$run_url"
    done < <(sort -f "$passed_tests_tsv")

    # Helper: format a ref value as a markdown link.
    # Args: ref, mode (optional).
    #   main         -> the resolved main commit
    #   source mode  -> the tree view, which accepts branches, tags and SHAs
    #   anything else (extension mode / no mode given) -> the release tag page
    _make_ref_link() {
      local _r="$1" _m="${2:-}"
      if [[ "$_r" == "main" ]]; then
        echo "[main](${SERVER_URL}/github/gh-aw/commit/${MAIN_SHA})"
      elif [[ "$_m" == "source" ]]; then
        echo "[${_r}](${SERVER_URL}/github/gh-aw/tree/${_r})"
      else
        echo "[${_r}](${SERVER_URL}/github/gh-aw/releases/tag/${_r})"
      fi
    }

    # Render a test × ref matrix table.
    # Args: samples_value ("true"|"false"), type ("errors"|"successes"), heading
    # Prints nothing when there are no columns or no matching tests.
    _render_matrix_table() {
      local _sv="$1"   # "true" or "false"
      local _type="$2" # "errors" or "successes"
      local _hdg="$3"
      local _lbl
      # Ordered columns: release → pre-release → main, then any others.
      local _cols=()
      for _lbl in "release" "pre-release" "main"; do
        [[ -n "${_ref_for_label[$_lbl]:-}" ]] && _cols+=("$_lbl")
      done
      for _lbl in "${!_ref_for_label[@]}"; do
        case "$_lbl" in
          release|pre-release|main) ;;
          *) _cols+=("$_lbl") ;;
        esac
      done
      [[ "${#_cols[@]}" -eq 0 ]] && return 0

      # "errors" lists every test that failed in this samples mode;
      # "successes" lists tests that passed and did not fail anywhere in it.
      local _tests
      if [[ "$_type" == "errors" ]]; then
        _tests=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$failed_tests_tsv" | sort -u)
      else
        local _fp _ff
        _fp=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$passed_tests_tsv" | sort -u)
        _ff=$(awk -F'\t' -v sv="$_sv" '$5==sv{print $1}' "$failed_tests_tsv" | sort -u)
        if [[ -z "$_fp" ]]; then
          _tests=""
        elif [[ -z "$_ff" ]]; then
          _tests="$_fp"
        else
          _tests=$(comm -23 <(echo "$_fp") <(echo "$_ff"))
        fi
      fi
      [[ -z "$_tests" ]] && return 0

      echo "## ${_hdg}"
      echo
      local _hdr="| test |"
      local _sep="| :--- |"
      local _ref _rlink
      for _lbl in "${_cols[@]}"; do
        _ref="${_ref_for_label[$_lbl]}"
        _rlink=$(_make_ref_link "$_ref" "${_mode_for_label[$_lbl]:-}")
        _hdr+=" ${_rlink} |"
        _sep+=" :---: |"
      done
      echo "$_hdr"
      echo "$_sep"

      local _tname _test_url _row _key _st _ru _cell
      while IFS= read -r _tname; do
        [[ -z "$_tname" ]] && continue
        _test_url="${SERVER_URL}/${REPO}/blob/main/.github/workflows/${_tname}.md"
        _row="| [${_tname}](${_test_url}) |"
        for _lbl in "${_cols[@]}"; do
          _key="${_sv}:${_tname}:${_lbl}"
          _st="${_cell_status[$_key]:-}"
          _ru="${_cell_url[$_key]:-}"
          if [[ "$_st" == "pass" ]]; then
            [[ -n "$_ru" ]] && _cell="[✅]($_ru)" || _cell="✅"
          elif [[ "$_st" == "fail" ]]; then
            [[ -n "$_ru" ]] && _cell="[❌]($_ru)" || _cell="❌"
          else
            _cell="—"
          fi
          _row+=" ${_cell} |"
        done
        echo "$_row"
      done <<< "$_tests"
      echo
    }

    # ---------------------------------------------------------------
    # Pass 2: render the markdown report.
    # ---------------------------------------------------------------
    {
      echo "### ${headline}"
      echo
      echo "**Run:** [$RUN_URL]($RUN_URL) &nbsp;·&nbsp; **Trigger:** \`${EVENT_NAME}\` &nbsp;·&nbsp; **Generated:** \`$(date -u +%FT%TZ)\` &nbsp;·&nbsp; **Outcome:** \`$E2E_RESULT\`"
      echo
      _render_matrix_table "true" "errors" "❌ Test errors — samples mode"
      _render_matrix_table "false" "errors" "❌ Test errors — inference mode"
      _render_matrix_table "true" "successes" "✅ Test successes — samples mode"
      _render_matrix_table "false" "successes" "✅ Test successes — inference mode"
      # When anything failed, append a ready-to-paste prompt for a coding
      # agent to triage the failures from the run logs.
      if [[ "$overall_failed" != "0" ]]; then
        echo "## 🤖 Agent triage prompt"
        echo
        echo "Copy the block below into a coding agent (e.g. Copilot) to investigate the failures:"
        echo
        echo '```text'
        echo "You are triaging failures from the gh-aw-test E2E suite."
        echo "Run: $RUN_URL"
        echo "Repository under test: github/gh-aw (the gh-aw CLI/compiler)."
        echo "Test harness repository: $GITHUB_REPOSITORY (this repo; runner is e2e.sh)."
        echo
        echo "Goal: for EACH failed test listed in this status report, access the GitHub"
        echo "Actions logs for the run above (and the per-entry artifacts e2e-<label>-samples-<bool>,"
        echo "which contain e2e-test-*.log, e2e-output.log and fails.txt), determine the root"
        echo "cause, and categorize the failure as exactly one of:"
        echo
        echo " 1. TRANSIENT — flaky/infra/network/rate-limit/timing; not a real defect."
        echo " Action: note it and recommend a re-run (./e2e.sh rerun)."
        echo " 2. TEST-FRAMEWORK BUG — a defect in this repo's harness (e2e.sh), a workflow"
        echo " source file (.github/workflows/test-*.md), a sample, or CI config."
        echo " Action: propose a concrete fix (file + change) in $GITHUB_REPOSITORY."
        echo " 3. GH-AW BUG — a defect in github/gh-aw itself (compiler output, runtime"
        echo " engine behaviour, safe-output handling, etc.)."
        echo " Action: open an issue in github/gh-aw with a minimal repro, the failing"
        echo " test name, the gh-aw ref/mode/samples combination, and links to the"
        echo " relevant log lines. Check for an existing open issue first and link it"
        echo " instead of filing a duplicate."
        echo
        echo "Steps:"
        echo " - Use 'gh run view <run-id> --log' and 'gh run download <run-id>' to fetch logs/artifacts."
        echo " - Read AGENTS.md in $GITHUB_REPOSITORY for harness conventions before proposing fixes."
        echo " - Group failures by suspected root cause; the same gh-aw bug may explain several."
        echo " - Produce a table: test | category | root cause | recommended action | issue/PR link."
        echo " - Only open github/gh-aw issues for category 3, and only after confirming no duplicate exists."
        echo '```'
      fi
    } > report.md
    rm -f "$entries_tsv" "$failed_tests_tsv" "$passed_tests_tsv"
    cat report.md
# Best effort: any error from the create call (for example because the label
# is already there) is silenced and ignored.
- name: Ensure status-report label exists
  run: |
    gh label create e2e-status-report \
      --repo "$GITHUB_REPOSITORY" \
      --description "Automated cross-ref E2E status report" \
      --color BFD4F2 \
      2>/dev/null || true
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
# Collects the numbers of the currently open status-report issues (at most 50)
# as a comma-separated list in the `numbers` step output.
- name: Find prior open status reports
  id: prior
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
  run: |
    set -euo pipefail
    open_reports=$(
      gh issue list \
        --repo "$GITHUB_REPOSITORY" \
        --state open \
        --label e2e-status-report \
        --limit 50 \
        --json number \
        --jq 'map(.number) | join(",")'
    )
    printf 'numbers=%s\n' "$open_reports" >> "$GITHUB_OUTPUT"
    printf 'Prior open status reports: %s\n' "${open_reports:-none}"
# issue-body.md = report.md, followed (when there are prior open reports) by a
# list of the issues this run is about to close.
- name: Compose final issue body (with links to previous reports)
  env:
    PRIOR: ${{ steps.prior.outputs.numbers }}
  run: |
    cat report.md > issue-body.md
    if [[ -n "$PRIOR" ]]; then
      IFS=',' read -ra prior_issues <<< "$PRIOR"
      {
        printf '\n%s\n' "## Previous status reports (closed by this run)"
        printf '* #%s\n' "${prior_issues[@]}"
      } >> issue-body.md
    fi
# Opens today's status-report issue and exposes its number and URL as step
# outputs; the number is the last path segment of the URL gh prints.
- name: Create new status report issue
  id: create
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
  run: |
    set -euo pipefail
    issue_title="E2E status report — $(date -u +%F) (gh-aw matrix)"
    issue_url=$(
      gh issue create \
        --repo "$GITHUB_REPOSITORY" \
        --label e2e-status-report \
        --title "$issue_title" \
        --body-file issue-body.md
    )
    issue_number="${issue_url##*/}"
    echo "Created: $issue_url"
    {
      echo "number=$issue_number"
      echo "url=$issue_url"
    } >> "$GITHUB_OUTPUT"
# Every previously open report gets a pointer to the new issue and is then
# closed. Skipped when no prior reports were found.
- name: Close previous status report issues
  if: ${{ steps.prior.outputs.numbers != '' }}
  env:
    GH_TOKEN: ${{ secrets.GH_AW_TEST_PAT }}
    PRIOR: ${{ steps.prior.outputs.numbers }}
    NEW_NUMBER: ${{ steps.create.outputs.number }}
    NEW_URL: ${{ steps.create.outputs.url }}
  run: |
    superseded_note="Superseded by #${NEW_NUMBER} — ${NEW_URL}"
    IFS=',' read -ra old_issues <<< "$PRIOR"
    for old in "${old_issues[@]}"; do
      gh issue comment "$old" --repo "$GITHUB_REPOSITORY" --body "$superseded_note"
      gh issue close "$old" --repo "$GITHUB_REPOSITORY" --reason "not planned"
    done
# Turns the verdict written by the report-building step into the job result.
# A missing verdict file counts as a failure.
- name: Fail the workflow if any matrix entry reported failures
  run: |
    verdict=$(cat /tmp/overall_failed 2>/dev/null || echo 1)
    if [[ "$verdict" == "0" ]]; then
      exit 0
    fi
    echo "::error::At least one gh-aw matrix entry reported failures (see status report issue)"
    exit 1