From b5231d765a8dd7bbc71bb47d64fbc3dfb184153b Mon Sep 17 00:00:00 2001 From: Greg Jackson Date: Mon, 30 Mar 2026 01:56:31 +0100 Subject: [PATCH 1/2] feat(review): add PR type triage and non-application review categories Add Step 3.5 to classify PRs by type (application, CI/infra, scripts, config, docs, tests, mixed) and guide which review categories to prioritize. Prevents false-confidence N/A results on non-application PRs. Add three new Pass 2 checklist categories: - Script & Shell Quality: shell options, shebangs, quoting, portability - Platform & Convention Consistency: toolchain matching, naming conventions - Configuration & Infrastructure Safety: hardcoded secrets, CI triggers Closes #356 Co-Authored-By: Claude Opus 4.6 (1M context) --- review/SKILL.md | 38 ++++++++++++++++++++++++++++++++++++++ review/SKILL.md.tmpl | 38 ++++++++++++++++++++++++++++++++++++++ review/checklist.md | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 1 deletion(-) diff --git a/review/SKILL.md b/review/SKILL.md index 52560d774..305e852d9 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -591,6 +591,44 @@ Run `git diff origin/` to get the full diff. This includes both committed --- +## Step 3.5: Classify PR type + +Before applying the checklist, classify the PR based on the files changed in the diff stat: + +1. Count files by category from the `git diff --stat` output: + - **Application code:** `.ts`, `.js`, `.py`, `.rb`, `.go`, `.rs`, `.java`, `.tsx`, `.jsx`, `.vue`, `.svelte` (source files in `src/`, `lib/`, `app/`, or similar) + - **CI/Infra:** `.yml`, `.yaml` in `.github/`, `.circleci/`, `.gitlab-ci/`; `Dockerfile*`, `docker-compose*`, `Makefile`, `Justfile` + - **Scripts:** `.sh`, `.bash`, `.zsh`, `.bat`, `.ps1`, `.cmd` files; files in `bin/`, `scripts/` directories + - **Config:** `.json`, `.yaml`, `.yml`, `.toml`, `.ini`, `.env*` config files (not in CI dirs) + - **Docs:** `.md`, `.txt`, `.rst` files + - **Tests:** files in `test/`, `tests/`, `spec/`, `__tests__/` directories; files matching `*.test.*`, `*.spec.*` + +2. Determine the **primary PR type** based on which category has the most changed files: + - `APPLICATION` — majority application code + - `CI_INFRA` — majority CI/infra files + - `SCRIPTS` — majority scripts + - `CONFIG` — majority config files + - `DOCS` — majority documentation + - `TESTS` — majority test files + - `MIXED` — no single category >50% of changed files + +3. Output (before the two-pass review): + ``` + PR Type: [TYPE] ([N] app, [N] ci, [N] scripts, [N] config, [N] docs, [N] tests) + ``` + +4. **Category relevance guide:** + - `APPLICATION` or `MIXED`: Run ALL checklist categories (current behavior) + - `CI_INFRA`: Prioritize Distribution & CI/CD Pipeline, Configuration & Infrastructure Safety, Script & Shell Quality. Skip SQL, LLM, View/Frontend, Type Coercion categories. + - `SCRIPTS`: Prioritize Script & Shell Quality, Platform & Convention Consistency, Shell Injection. Skip SQL, LLM, View/Frontend categories. + - `CONFIG`: Prioritize Configuration & Infrastructure Safety, Platform & Convention Consistency. Skip most application categories. + - `DOCS`: Check Dead Code & Consistency (stale docs), Platform & Convention Consistency (naming). Skip all code categories. + - `TESTS`: Run Test Gaps, Completeness Gaps, and application categories relevant to what's being tested. + + **Important:** "Skip" means "don't spend time on categories that will obviously return N/A." If a skipped category is unexpectedly relevant (e.g., a CI file that also contains SQL), still flag it. Use judgment — the classification is a guide, not a hard rule. + +--- + ## Prior Learnings Search for relevant learnings from previous sessions: diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index fa14f26a7..8618f2e03 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -104,6 +104,44 @@ Run `git diff origin/` to get the full diff. This includes both committed --- +## Step 3.5: Classify PR type + +Before applying the checklist, classify the PR based on the files changed in the diff stat: + +1. Count files by category from the `git diff --stat` output: + - **Application code:** `.ts`, `.js`, `.py`, `.rb`, `.go`, `.rs`, `.java`, `.tsx`, `.jsx`, `.vue`, `.svelte` (source files in `src/`, `lib/`, `app/`, or similar) + - **CI/Infra:** `.yml`, `.yaml` in `.github/`, `.circleci/`, `.gitlab-ci/`; `Dockerfile*`, `docker-compose*`, `Makefile`, `Justfile` + - **Scripts:** `.sh`, `.bash`, `.zsh`, `.bat`, `.ps1`, `.cmd` files; files in `bin/`, `scripts/` directories + - **Config:** `.json`, `.yaml`, `.yml`, `.toml`, `.ini`, `.env*` config files (not in CI dirs) + - **Docs:** `.md`, `.txt`, `.rst` files + - **Tests:** files in `test/`, `tests/`, `spec/`, `__tests__/` directories; files matching `*.test.*`, `*.spec.*` + +2. Determine the **primary PR type** based on which category has the most changed files: + - `APPLICATION` — majority application code + - `CI_INFRA` — majority CI/infra files + - `SCRIPTS` — majority scripts + - `CONFIG` — majority config files + - `DOCS` — majority documentation + - `TESTS` — majority test files + - `MIXED` — no single category >50% of changed files + +3. Output (before the two-pass review): + ``` + PR Type: [TYPE] ([N] app, [N] ci, [N] scripts, [N] config, [N] docs, [N] tests) + ``` + +4. **Category relevance guide:** + - `APPLICATION` or `MIXED`: Run ALL checklist categories (current behavior) + - `CI_INFRA`: Prioritize Distribution & CI/CD Pipeline, Configuration & Infrastructure Safety, Script & Shell Quality. Skip SQL, LLM, View/Frontend, Type Coercion categories. + - `SCRIPTS`: Prioritize Script & Shell Quality, Platform & Convention Consistency, Shell Injection. Skip SQL, LLM, View/Frontend categories. + - `CONFIG`: Prioritize Configuration & Infrastructure Safety, Platform & Convention Consistency. Skip most application categories. + - `DOCS`: Check Dead Code & Consistency (stale docs), Platform & Convention Consistency (naming). Skip all code categories. + - `TESTS`: Run Test Gaps, Completeness Gaps, and application categories relevant to what's being tested. + + **Important:** "Skip" means "don't spend time on categories that will obviously return N/A." If a skipped category is unexpectedly relevant (e.g., a CI file that also contains SQL), still flag it. Use judgment — the classification is a guide, not a hard rule. + +--- + {{LEARNINGS_SEARCH}} ## Step 4: Two-pass review diff --git a/review/checklist.md b/review/checklist.md index cfedcf81f..93da21953 100644 --- a/review/checklist.md +++ b/review/checklist.md @@ -154,6 +154,40 @@ To do this: use Grep to find all references to the sibling values (e.g., grep fo - Internal tools not distributed outside the team - Test-only CI changes (adding test steps, not publish steps) +#### Script & Shell Quality +- Shell scripts without `set -euo pipefail` (or individual `set -e`, `set -u`) — failures silently ignored +- Missing or incorrect shebang lines (`#!/usr/bin/env bash` preferred over `#!/bin/bash` for portability) +- Unquoted variables in shell scripts (`$VAR` instead of `"$VAR"`) — word splitting and globbing bugs +- `eval` usage with variable interpolation — injection risk +- Platform-specific commands without guards (e.g., `sed -i ''` is macOS-only, Linux needs `sed -i`) +- Scripts missing `chmod +x` or not matching the executable permissions of sibling scripts in the same directory + +**DO NOT flag:** +- Scripts that are explicitly single-platform (documented in comments or filename like `*.bat`) +- Helper scripts only run via a parent script that already sets shell options + +#### Platform & Convention Consistency +- New files using a different toolchain than existing files in the same directory (e.g., `.bat` script added to a directory of `.sh` scripts, `Makefile` added alongside existing `package.json` scripts) +- CI workflow targeting an OS not covered by the project's test matrix or existing CI jobs (e.g., Windows step in a Linux-only CI) +- New config files duplicating settings already managed by an existing config (e.g., adding `.prettierrc` when `package.json` already has prettier config) +- Naming convention violations: new files not matching the case/separator pattern of sibling files (e.g., `myScript.sh` among `kebab-case.sh` files) + +**DO NOT flag:** +- Cross-platform additions that are clearly intentional (documented in PR or commit message) +- Files in new directories that establish their own conventions + +#### Configuration & Infrastructure Safety +- Secrets or tokens hardcoded in CI workflows — must use `${{ secrets.X }}` or environment variable references +- CI workflows with `pull_request_target` trigger without explicit commit SHA pinning — allows arbitrary code execution from forks +- Docker images pinned by mutable tag (`:latest`, `:main`) instead of SHA digest in CI — supply chain risk +- Overly broad file permissions in scripts (`chmod 777` or world-writable) +- Missing `.gitignore` entries for new generated/build artifact directories +- Environment-specific paths hardcoded (e.g., `/home/runner/`, `/Users/`) instead of using variables + +**DO NOT flag:** +- Development-only Docker compose files using `:latest` +- Local development scripts with relaxed permissions + --- ## Severity Classification @@ -171,7 +205,10 @@ CRITICAL (highest severity): INFORMATIONAL (lower severity): ├─ Type Coercion at Boundaries ├─ View/Frontend ├─ Performance & Bundle Impact - └─ Distribution & CI/CD Pipeline + ├─ Distribution & CI/CD Pipeline + ├─ Script & Shell Quality + ├─ Platform & Convention Consistency + └─ Configuration & Infrastructure Safety All findings are actioned via Fix-First Review. Severity determines presentation order and classification of AUTO-FIX vs ASK — critical From 63b078f4a0b6624856cad55d2ed7e08d3225e4e7 Mon Sep 17 00:00:00 2001 From: Greg Jackson Date: Mon, 30 Mar 2026 01:58:11 +0100 Subject: [PATCH 2/2] test(review): add E2E test for CI/infra PR blindspot detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plants 3 issues in a CI workflow (pull_request_target without SHA pinning, hardcoded token, chmod 777 on Windows runner) and verifies the review catches at least 2 of 3. Registered as gate tier — CI security issues should always be caught. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/fixtures/review-eval-ci-baseline.yml | 17 +++++ test/fixtures/review-eval-ci-blindspot.yml | 27 +++++++ test/helpers/touchfiles.ts | 2 + test/skill-e2e-review.test.ts | 85 ++++++++++++++++++++++ 4 files changed, 131 insertions(+) create mode 100644 test/fixtures/review-eval-ci-baseline.yml create mode 100644 test/fixtures/review-eval-ci-blindspot.yml diff --git a/test/fixtures/review-eval-ci-baseline.yml b/test/fixtures/review-eval-ci-baseline.yml new file mode 100644 index 000000000..f19b70303 --- /dev/null +++ b/test/fixtures/review-eval-ci-baseline.yml @@ -0,0 +1,17 @@ +name: CI +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + - run: npm ci + - run: npm test diff --git a/test/fixtures/review-eval-ci-blindspot.yml b/test/fixtures/review-eval-ci-blindspot.yml new file mode 100644 index 000000000..0d28dbc6b --- /dev/null +++ b/test/fixtures/review-eval-ci-blindspot.yml @@ -0,0 +1,27 @@ +name: CI +on: + push: + branches: [main] + pull_request_target: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + - run: npm ci + - run: npm test + + deploy: + runs-on: windows-latest + needs: test + steps: + - uses: actions/checkout@v4 + - run: | + TOKEN="ghp_abc123secrettoken" + curl -H "Authorization: token $TOKEN" https://api.github.com/repos + - run: chmod 777 deploy.sh && ./deploy.sh diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index b475daad7..b4620356d 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -58,6 +58,7 @@ export const E2E_TOUCHFILES: Record = { 'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'], 'review-base-branch': ['review/**'], 'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'], + 'review-ci-blindspot': ['review/**', 'test/fixtures/review-eval-ci-*'], // Office Hours 'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'], @@ -199,6 +200,7 @@ export const E2E_TIERS: Record = { 'review-enum-completeness': 'gate', 'review-base-branch': 'gate', 'review-design-lite': 'periodic', // 4/7 threshold is subjective + 'review-ci-blindspot': 'gate', // Security guardrail: CI blindspot detection 'review-coverage-audit': 'gate', 'review-plan-completion': 'gate', 'review-dashboard-via': 'gate', diff --git a/test/skill-e2e-review.test.ts b/test/skill-e2e-review.test.ts index dacd4b166..e9714cd51 100644 --- a/test/skill-e2e-review.test.ts +++ b/test/skill-e2e-review.test.ts @@ -252,6 +252,91 @@ Important: The design checklist should catch issues like blacklisted fonts, smal }, 300_000); }); +// --- Review: CI/infra blindspot E2E --- + +describeIfSelected('Review CI blindspot E2E', ['review-ci-blindspot'], () => { + let ciDir: string; + + beforeAll(() => { + ciDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-ci-blindspot-')); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: ciDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Commit clean CI baseline on main + fs.mkdirSync(path.join(ciDir, '.github', 'workflows'), { recursive: true }); + const baseline = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-ci-baseline.yml'), 'utf-8'); + fs.writeFileSync(path.join(ciDir, '.github', 'workflows', 'ci.yml'), baseline); + + // Add a few bash scripts to establish repo conventions + fs.mkdirSync(path.join(ciDir, 'bin'), { recursive: true }); + fs.writeFileSync(path.join(ciDir, 'bin', 'test.sh'), '#!/usr/bin/env bash\nset -euo pipefail\nnpm test\n'); + fs.writeFileSync(path.join(ciDir, 'bin', 'build.sh'), '#!/usr/bin/env bash\nset -euo pipefail\nnpm run build\n'); + + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial CI and scripts']); + + // Feature branch with planted CI issues + run('git', ['checkout', '-b', 'feature/update-ci']); + const blindspot = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-ci-blindspot.yml'), 'utf-8'); + fs.writeFileSync(path.join(ciDir, '.github', 'workflows', 'ci.yml'), blindspot); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'update CI workflow']); + + // Copy review skill files + fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(ciDir, 'review-SKILL.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(ciDir, 'review-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(ciDir, 'review-greptile-triage.md')); + }); + + afterAll(() => { + try { fs.rmSync(ciDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('review-ci-blindspot', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on branch feature/update-ci with changes against main. +Read review-SKILL.md for the review workflow instructions. +Also read review-checklist.md and apply it. +Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to the review. +Run /review on the current diff (git diff main...HEAD). + +The diff modifies a CI workflow file. Pay attention to PR type classification and CI-specific review categories. +Write your review findings to ${ciDir}/review-output.md`, + workingDirectory: ciDir, + maxTurns: 20, + timeout: 180_000, + testName: 'review-ci-blindspot', + runId, + }); + + logCost('/review ci-blindspot', result); + recordE2E(evalCollector, '/review CI blindspot', 'Review CI blindspot E2E', result); + expect(result.exitReason).toBe('success'); + + // Verify the review caught at least 2 of 3 planted issues + const reviewPath = path.join(ciDir, 'review-output.md'); + if (fs.existsSync(reviewPath)) { + const review = fs.readFileSync(reviewPath, 'utf-8').toLowerCase(); + let detected = 0; + + // Issue 1: pull_request_target security risk + if (review.includes('pull_request_target') || review.includes('fork') || review.includes('arbitrary code')) detected++; + // Issue 2: Hardcoded secret/token + if (review.includes('hardcoded') || review.includes('secret') || review.includes('token') || review.includes('ghp_')) detected++; + // Issue 3: chmod 777 or Windows runner inconsistency + if (review.includes('777') || review.includes('windows') || review.includes('permission') || review.includes('platform')) detected++; + + console.log(`CI blindspot review detected ${detected}/3 planted issues`); + expect(detected).toBeGreaterThanOrEqual(2); + } + }, 210_000); +}); + // --- Base branch detection smoke tests --- describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {