diff --git a/demo/substack-spec-v01.dot b/demo/substack-spec-v01.dot index 486eb721..1bcc4e10 100644 --- a/demo/substack-spec-v01.dot +++ b/demo/substack-spec-v01.dot @@ -81,7 +81,7 @@ digraph substack_spec_v01 { max_agent_turns=300, max_tokens=32768, label="Implement", - prompt="Goal: $goal\n\nRead .ai/runs/$KILROY_RUN_ID/plan_final.md, .ai/runs/$KILROY_RUN_ID/spec.md, and .ai/runs/$KILROY_RUN_ID/definition_of_done.md. If the spec or DoD files do not exist at those paths, fall back to reading substack-spec-v01.md and substack-dod-v01.md directly.\n\nBEFORE ANYTHING ELSE: check if .ai/verify_errors.log exists. If it does, read it — it contains the exact commands that failed and their error output from the verify chain. Fix every error listed in that file, then delete .ai/verify_errors.log when all fixes are applied. Do NOT regenerate working code — only fix the specific errors.\n\nAlso check if .ai/runs/$KILROY_RUN_ID/verify_fidelity.md exists. If it does, read it — it contains per-AC pass/fail verdicts from the fidelity check. Fix every failing AC listed in that file.\n\nIf .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, read it and fix ONLY identified gaps — do NOT regenerate working code. On repair passes, read and fix existing files rather than skipping them.\n\nImplement the complete Substack Creator Newsletter Engine as a single pass. On a fresh pass (no postmortem), check if target files already exist on disk and are non-empty — if so, skip those files. Implement each module with complete, functional code — no stubs, no placeholders, no TODO comments. Follow the plan and spec precisely.\n\nImplementation order (core infrastructure first, then features, then tests/deploy):\n\n1. Project scaffold — package.json, vite.config.ts, tsconfig.json, index.html, src/main.tsx, src/App.tsx. Install dependencies: react, react-dom, react-router-dom, idb, @google/generative-ai. 
Write ALL validation scripts:\n - scripts/validate-build.sh: runs npm run build, checks dist/ exists\n - scripts/validate-fmt.sh: runs npx prettier --check src/\n - scripts/validate-test.sh: runs integration scenarios first, then smoke, writes evidence + .ai/test-evidence/latest/manifest.json even on failure\n - scripts/validate-browser.sh: runs browser verification and captures artifacts\n - scripts/fix-fmt.sh: runs npx prettier --write src/\n - scripts/validate-artifacts.sh: verifies manifest scenario IDs match DoD integration scenarios\n All scripts: #!/bin/sh, set -e, POSIX sh failure trap.\n\n2. LLM client — src/lib/llm.ts, src/lib/llm-schemas.ts: structured JSON output with schema enforcement, retry with error feedback and intelligent backoff, model switching (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side API key.\n\n3. Persistence — src/lib/db.ts, src/lib/types.ts: IndexedDB via idb with stores for configuration (API key, company, voice, guardrails), drafts, sessions (all inputs/LLM responses/intermediate state), post history (Markdown + attribution). All data persists unless user resets.\n\n4. Shared UI — src/components/RichInput.tsx, Card.tsx, ProgressBar.tsx, StepIndicator.tsx, src/styles/global.css.\n\n5. Setup flow — src/pages/Settings.tsx and step components: API key, company (rich input + gemini-3.1-pro-preview confirm + back), voice, guardrails. Parallel completion, status icons. Reset everything with confirmation.\n\n6. Dashboard — src/pages/Dashboard.tsx: New Post button, Trending Topics button, Settings link, post history, draft resume.\n\n7. Trending Topics — src/pages/TrendingTopics.tsx: parallel gemini-3-flash-preview search grounding queries, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post prefilled.\n\n8. 
New Post — src/pages/NewPost.tsx with step components: Topic (rich input), Research (gemini-3-flash-preview search grounding, source cards with URL/title/author/date, highlight/delete), Outline (gemini-3.1-pro-preview one-shot, accept/back), Write (3 automatic gemini-3.1-pro-preview cycles: Write with citations, Edit for style, Guardrails-only), Complete (serif post with numbered footnotes, linked sources, attribution lineage).\n\n9. Demo mode — src/lib/demo.ts, src/pages/DemoMode.tsx, src/demo/bundled-session.json: session recording, replay through production path (fade-in prefills, highlight next button), bundled P&G session, cache-miss error with no API fallback.\n\n10. Test infrastructure — src/__tests__/: integration with canned data, smoke with gemini-2.5-flash-lite, manual mode option.\n\n11. Deploy config — railway.json or equivalent, deployment docs.\n\nEnsure App.tsx routing includes all pages. Verify imports/exports are consistent. Run npm install. Fix TypeScript errors.\n\nLog progress to .ai/runs/$KILROY_RUN_ID/implementation_log.md.\n\nPRE-EXIT VERIFICATION: if .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, run sh scripts/validate-build.sh and re-read targeted files to confirm fixes.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH) ONLY on failure: status=fail with failure_reason, failure_class, and failure_signature=implement_fail." + prompt="Goal: $goal\n\nRead .ai/runs/$KILROY_RUN_ID/plan_final.md, .ai/runs/$KILROY_RUN_ID/spec.md, and .ai/runs/$KILROY_RUN_ID/definition_of_done.md. If the spec or DoD files do not exist at those paths, fall back to reading substack-spec-v01.md and substack-dod-v01.md directly.\n\nBEFORE ANYTHING ELSE: check if .ai/runs/$KILROY_RUN_ID/verify_errors.log exists. If it does, read it — it contains the exact commands that failed and their error output from the verify chain. 
Fix every error listed in that file, then delete .ai/runs/$KILROY_RUN_ID/verify_errors.log when all fixes are applied. Do NOT regenerate working code — only fix the specific errors.\n\nAlso check if .ai/runs/$KILROY_RUN_ID/verify_fidelity.md exists. If it does, read it — it contains per-AC pass/fail verdicts from the fidelity check. Fix every failing AC listed in that file.\n\nIf .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, read it and fix ONLY identified gaps — do NOT regenerate working code. On repair passes, read and fix existing files rather than skipping them.\n\nImplement the complete Substack Creator Newsletter Engine as a single pass. On a fresh pass (no postmortem), check if target files already exist on disk and are non-empty — if so, skip those files. Implement each module with complete, functional code — no stubs, no placeholders, no TODO comments. Follow the plan and spec precisely.\n\nImplementation order (core infrastructure first, then features, then tests/deploy):\n\n1. Project scaffold — package.json, vite.config.ts, tsconfig.json, index.html, src/main.tsx, src/App.tsx. Install dependencies: react, react-dom, react-router-dom, idb, @google/generative-ai. Write ALL validation scripts:\n - scripts/validate-build.sh: runs npm run build, checks dist/ exists\n - scripts/validate-fmt.sh: runs npx prettier --check src/\n - scripts/validate-test.sh: runs integration scenarios first, then smoke, writes evidence + .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json even on failure\n - scripts/validate-browser.sh: runs browser verification and captures artifacts\n - scripts/fix-fmt.sh: runs npx prettier --write src/\n - scripts/validate-artifacts.sh: verifies manifest scenario IDs match DoD integration scenarios\n All scripts: #!/bin/sh, set -e, POSIX sh failure trap.\n\n2. 
LLM client — src/lib/llm.ts, src/lib/llm-schemas.ts: structured JSON output with schema enforcement, retry with error feedback and intelligent backoff, model switching (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side API key.\n\n3. Persistence — src/lib/db.ts, src/lib/types.ts: IndexedDB via idb with stores for configuration (API key, company, voice, guardrails), drafts, sessions (all inputs/LLM responses/intermediate state), post history (Markdown + attribution). All data persists unless user resets.\n\n4. Shared UI — src/components/RichInput.tsx, Card.tsx, ProgressBar.tsx, StepIndicator.tsx, src/styles/global.css.\n\n5. Setup flow — src/pages/Settings.tsx and step components: API key, company (rich input + gemini-3.1-pro-preview confirm + back), voice, guardrails. Parallel completion, status icons. Reset everything with confirmation.\n\n6. Dashboard — src/pages/Dashboard.tsx: New Post button, Trending Topics button, Settings link, post history, draft resume.\n\n7. Trending Topics — src/pages/TrendingTopics.tsx: parallel gemini-3-flash-preview search grounding queries, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post prefilled.\n\n8. New Post — src/pages/NewPost.tsx with step components: Topic (rich input), Research (gemini-3-flash-preview search grounding, source cards with URL/title/author/date, highlight/delete), Outline (gemini-3.1-pro-preview one-shot, accept/back), Write (3 automatic gemini-3.1-pro-preview cycles: Write with citations, Edit for style, Guardrails-only), Complete (serif post with numbered footnotes, linked sources, attribution lineage).\n\n9. Demo mode — src/lib/demo.ts, src/pages/DemoMode.tsx, src/demo/bundled-session.json: session recording, replay through production path (fade-in prefills, highlight next button), bundled P&G session, cache-miss error with no API fallback.\n\n10. 
Test infrastructure — src/__tests__/: integration with canned data, smoke with gemini-2.5-flash-lite, manual mode option.\n\n11. Deploy config — railway.json or equivalent, deployment docs.\n\nEnsure App.tsx routing includes all pages. Verify imports/exports are consistent. Run npm install. Fix TypeScript errors.\n\nLog progress to .ai/runs/$KILROY_RUN_ID/implementation_log.md.\n\nPRE-EXIT VERIFICATION: if .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, run sh scripts/validate-build.sh and re-read targeted files to confirm fixes.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH) ONLY on failure: status=fail with failure_reason, failure_class, and failure_signature=implement_fail." ] } @@ -95,46 +95,46 @@ digraph substack_spec_v01 { fix_fmt [ shape=parallelogram, max_retries=0, - tool_command="sh scripts/fix-fmt.sh 2>&1 || { printf '\\n=== VERIFY FAILURE: sh scripts/fix-fmt.sh ===\\n%s\\n' \"$(cat /tmp/fix-fmt.log 2>/dev/null || echo 'script missing or produced no output')\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/fix-fmt.sh 2>&1 || { printf '\\n=== VERIFY FAILURE: sh scripts/fix-fmt.sh ===\\n%s\\n' \"$(cat /tmp/fix-fmt.log 2>/dev/null || echo 'script missing or produced no output')\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] verify_fmt [ shape=parallelogram, max_retries=0, - tool_command="sh scripts/validate-fmt.sh 2>&1 | tee /tmp/validate-fmt.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-fmt.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-fmt.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-fmt.sh 2>&1 | tee /tmp/validate-fmt.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-fmt.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-fmt.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_fmt [shape=diamond, label="Fmt OK?"] verify_build [ shape=parallelogram, - 
tool_command="sh scripts/validate-build.sh 2>&1 | tee /tmp/validate-build.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-build.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-build.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-build.sh 2>&1 | tee /tmp/validate-build.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-build.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-build.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_build [shape=diamond, label="Build OK?"] verify_test [ shape=parallelogram, - tool_command="sh scripts/validate-test.sh 2>&1 | tee /tmp/validate-test.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-test.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-test.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-test.sh 2>&1 | tee /tmp/validate-test.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-test.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-test.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_test [shape=diamond, label="Tests OK?"] verify_browser [ shape=parallelogram, collect_browser_artifacts=true, - tool_command="sh scripts/validate-browser.sh 2>&1 | tee /tmp/validate-browser.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-browser.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-browser.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-browser.sh 2>&1 | tee /tmp/validate-browser.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-browser.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-browser.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_browser [shape=diamond, label="Browser OK?"] verify_artifacts [ shape=parallelogram, max_retries=0, - tool_command="sh 
scripts/validate-artifacts.sh 2>&1 | tee /tmp/validate-artifacts.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-artifacts.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-artifacts.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-artifacts.sh 2>&1 | tee /tmp/validate-artifacts.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-artifacts.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-artifacts.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_artifacts [shape=diamond, label="Artifacts OK?"] verify_fidelity [ shape=box, class="verify", - prompt="Read .ai/runs/$KILROY_RUN_ID/spec.md, .ai/runs/$KILROY_RUN_ID/definition_of_done.md, .ai/runs/$KILROY_RUN_ID/verify_fidelity.md (if present), .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json, and relevant implementation files.\n\nEvaluate these grouped acceptance checks and map each to concrete file paths:\nAC1: src/**/settings* and src/**/router* and src/**/indexeddb* - first-run routing, setup flow, persistence.\nAC2: src/**/dashboard* and src/**/history* and src/**/draft* - dashboard actions and resume/view flows.\nAC3: src/**/trending* and src/**/research* - grounded research, deterministic trends, prompt handoff.\nAC4: src/**/new-post* and src/**/outline* and src/**/write* and src/**/complete* - full Topic->Complete pipeline with automatic write cycles.\nAC5: src/**/citation* and src/**/markdown* - citation lineage, footnote rendering, attribution persistence.\nAC6: src/**/demo* and src/demo/** - session picker/replay, bundled P&G demo, cache-miss no-fallback behavior.\nAC7: src/**/llm* and src/**/schema* - structured outputs, retry/backoff, production/test model intent.\nAC8: scripts/validate-build.sh and railway.json/Procfile/README* - build/deploy-readiness by static review only, no live deployment execution.\nAC9: scripts/validate-test.sh and scripts/validate-browser.sh and test sources 
- integration before smoke, manual mode option, browser evidence capture.\nAC10: .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/** - IT-1..IT-12 manifest coverage and required artifact types.\nAC11: src/**/style* and src/**/card* and src/**/progress* - Substack-like visual contract including bars, cards, and serif post preview.\n\nWrite .ai/runs/$KILROY_RUN_ID/verify_fidelity.md with pass/fail verdict and evidence per AC1..AC11.\n\nOn ANY failure: also append to .ai/verify_errors.log with the header '=== VERIFY FAILURE: verify_fidelity ===' followed by the list of failing ACs and their specific issues, so the implement node can read the consolidated error log.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH):\nAll pass: status=success.\nAny fail: status=fail with failure_reason listing failing ACs, failure_class=deterministic_agent_bug, failure_signature as sorted comma-separated failing AC IDs (e.g. AC1,AC10)." 
+ prompt="Read .ai/runs/$KILROY_RUN_ID/spec.md, .ai/runs/$KILROY_RUN_ID/definition_of_done.md, .ai/runs/$KILROY_RUN_ID/verify_fidelity.md (if present), .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json, and relevant implementation files.\n\nEvaluate these grouped acceptance checks and map each to concrete file paths:\nAC1: src/**/settings* and src/**/router* and src/**/indexeddb* - first-run routing, setup flow, persistence.\nAC2: src/**/dashboard* and src/**/history* and src/**/draft* - dashboard actions and resume/view flows.\nAC3: src/**/trending* and src/**/research* - grounded research, deterministic trends, prompt handoff.\nAC4: src/**/new-post* and src/**/outline* and src/**/write* and src/**/complete* - full Topic->Complete pipeline with automatic write cycles.\nAC5: src/**/citation* and src/**/markdown* - citation lineage, footnote rendering, attribution persistence.\nAC6: src/**/demo* and src/demo/** - session picker/replay, bundled P&G demo, cache-miss no-fallback behavior.\nAC7: src/**/llm* and src/**/schema* - structured outputs, retry/backoff, production/test model intent.\nAC8: scripts/validate-build.sh and railway.json/Procfile/README* - build/deploy-readiness by static review only, no live deployment execution.\nAC9: scripts/validate-test.sh and scripts/validate-browser.sh and test sources - integration before smoke, manual mode option, browser evidence capture.\nAC10: .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/** - IT-1..IT-12 manifest coverage and required artifact types.\nAC11: src/**/style* and src/**/card* and src/**/progress* - Substack-like visual contract including bars, cards, and serif post preview.\n\nWrite .ai/runs/$KILROY_RUN_ID/verify_fidelity.md with pass/fail verdict and evidence per AC1..AC11.\n\nOn ANY failure: also append to .ai/runs/$KILROY_RUN_ID/verify_errors.log with the header '=== VERIFY FAILURE: verify_fidelity ===' followed by the list of failing 
ACs and their specific issues, so the implement node can read the consolidated error log.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH):\nAll pass: status=success.\nAny fail: status=fail with failure_reason listing failing ACs, failure_class=deterministic_agent_bug, failure_signature as sorted comma-separated failing AC IDs (e.g. AC1,AC10)." ] check_impl [shape=diamond, label="Impl OK?"] } @@ -151,12 +151,12 @@ digraph substack_spec_v01 { review_a [ class="branch-a", - prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4567 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4567/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. 
Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_a.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." 
+ prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4567 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4567/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/runs/$KILROY_RUN_ID/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 
automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_a.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." ] review_b [ class="branch-b", - prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4568 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4568/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. 
Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_b.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." 
+ prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4568 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4568/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/runs/$KILROY_RUN_ID/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 
automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_b.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." ] review_consensus [ @@ -204,7 +204,7 @@ digraph substack_spec_v01 { // Implement -> Verify chain implement -> fix_fmt - // Verify chain — failures go directly back to implement (errors logged to .ai/verify_errors.log) + // Verify chain — failures go directly back to implement (errors logged to .ai/runs/$KILROY_RUN_ID/verify_errors.log) fix_fmt -> verify_fmt verify_fmt -> check_fmt check_fmt -> verify_build [condition="outcome=success"] diff --git a/internal/attractor/engine/cli_only_models.go b/internal/attractor/engine/cli_only_models.go index 2ff9de6a..2feb2bff 100644 --- a/internal/attractor/engine/cli_only_models.go +++ b/internal/attractor/engine/cli_only_models.go @@ -5,7 +5,8 @@ import "strings" // cliOnlyModelIDs lists models that MUST route through CLI backend regardless // of provider backend configuration. These models have no API endpoint. 
var cliOnlyModelIDs = map[string]bool{ - "gpt-5.4-spark": true, + "gpt-5.3-codex-spark": true, + "gpt-5.4-spark": true, } // isCLIOnlyModel returns true if the given model ID (with or without provider diff --git a/internal/attractor/engine/cli_only_models_test.go b/internal/attractor/engine/cli_only_models_test.go index 316758fb..3b6d02f1 100644 --- a/internal/attractor/engine/cli_only_models_test.go +++ b/internal/attractor/engine/cli_only_models_test.go @@ -8,7 +8,7 @@ func TestIsCLIOnlyModel(t *testing.T) { want bool }{ {"gpt-5.4-spark", true}, - {"GPT-5.3-CODEX-SPARK", true}, // case-insensitive + {"GPT-5.3-CODEX-SPARK", true}, // case-insensitive {"openai/gpt-5.4-spark", true}, // with provider prefix {"gpt-5.4", false}, // regular codex {"gpt-5.4", false}, diff --git a/internal/attractor/engine/engine.go b/internal/attractor/engine/engine.go index 43aa0a2d..2eba720f 100644 --- a/internal/attractor/engine/engine.go +++ b/internal/attractor/engine/engine.go @@ -1223,7 +1223,7 @@ func (e *Engine) executeWithRetry(ctx context.Context, node *model.Node, retries _ = writeJSON(filepath.Join(stageDir, "status.json"), fo) return fo, nil } - if out.Status == runtime.StatusSuccess || out.Status == runtime.StatusPartialSuccess || out.Status == runtime.StatusSkipped { + if out.Status == runtime.StatusSuccess || out.Status == runtime.StatusDegradedSuccess || out.Status == runtime.StatusPartialSuccess || out.Status == runtime.StatusSkipped { retries[node.ID] = 0 return out, nil } @@ -1957,7 +1957,7 @@ func checkGoalGates(g *model.Graph, outcomes map[string]runtime.Outcome) (bool, if !strings.EqualFold(n.Attr("goal_gate", "false"), "true") { continue } - if out.Status != runtime.StatusSuccess && out.Status != runtime.StatusPartialSuccess { + if out.Status != runtime.StatusSuccess && out.Status != runtime.StatusDegradedSuccess && out.Status != runtime.StatusPartialSuccess { return false, id } } diff --git a/internal/attractor/engine/prompts/stage_status_contract_preamble.tmpl 
b/internal/attractor/engine/prompts/stage_status_contract_preamble.tmpl index 5522ce6b..eb48bef4 100644 --- a/internal/attractor/engine/prompts/stage_status_contract_preamble.tmpl +++ b/internal/attractor/engine/prompts/stage_status_contract_preamble.tmpl @@ -4,3 +4,6 @@ ${{.StageStatusFallbackPathEnvKey}} = {{.FallbackPath}} - Write your status JSON to ${{.StageStatusPathEnvKey}}. If that write fails, use run-scoped fallback ${{.StageStatusFallbackPathEnvKey}}. - Do not write status.json to nested module directories. - To end this stage: write the status file, then return your response. Do NOT call close_agent or any session-management tool. +- Verification reporting: if you ran verification commands (tests, linters, type checks) and any failed or were blocked by infra issues (missing tools, DNS errors, network failures), you MUST set status to "degraded_success" instead of "success" and include a "verification" object: + {"status": "degraded_success", "verification": {"status": "blocked", "blocked_reason": "npm registry DNS failure", "commands": [{"command": "npx tsc", "exit_code": 1, "blocked": true, "reason": "EAI_AGAIN"}]}, ...} +- Use "success" only when all verification commands actually executed and passed, or when no verification was needed. 
diff --git a/internal/attractor/runtime/status.go b/internal/attractor/runtime/status.go index a33f1f92..ef479bce 100644 --- a/internal/attractor/runtime/status.go +++ b/internal/attractor/runtime/status.go @@ -9,17 +9,20 @@ import ( type StageStatus string const ( - StatusSuccess StageStatus = "success" - StatusPartialSuccess StageStatus = "partial_success" - StatusRetry StageStatus = "retry" - StatusFail StageStatus = "fail" - StatusSkipped StageStatus = "skipped" + StatusSuccess StageStatus = "success" + StatusDegradedSuccess StageStatus = "degraded_success" + StatusPartialSuccess StageStatus = "partial_success" + StatusRetry StageStatus = "retry" + StatusFail StageStatus = "fail" + StatusSkipped StageStatus = "skipped" ) func ParseStageStatus(s string) (StageStatus, error) { switch strings.ToLower(strings.TrimSpace(s)) { case "success", "ok": return StatusSuccess, nil + case "degraded_success", "degradedsuccess", "degraded-success": + return StatusDegradedSuccess, nil case "partial_success", "partialsuccess", "partial-success": return StatusPartialSuccess, nil case "retry": @@ -49,20 +52,36 @@ func (s StageStatus) Valid() bool { - // (success, partial_success, retry, fail, skipped) rather than a custom routing value. + // (success, degraded_success, partial_success, retry, fail, skipped) rather than a custom routing value. func (s StageStatus) IsCanonical() bool { switch s { - case StatusSuccess, StatusPartialSuccess, StatusRetry, StatusFail, StatusSkipped: + case StatusSuccess, StatusDegradedSuccess, StatusPartialSuccess, StatusRetry, StatusFail, StatusSkipped: return true default: return false } } +// VerificationResult captures the outcome of verification commands run during a stage. +type VerificationResult struct { + Status string `json:"status"` // "passed", "failed", or "blocked" + BlockedReason string `json:"blocked_reason,omitempty"` // why verification could not run + Commands []VerificationEntry `json:"commands,omitempty"` // individual command results +} + +// VerificationEntry records the result of a single verification command.
+type VerificationEntry struct { + Command string `json:"command"` + ExitCode int `json:"exit_code"` + Blocked bool `json:"blocked,omitempty"` + Reason string `json:"reason,omitempty"` +} + type Outcome struct { - Status StageStatus `json:"status"` - PreferredLabel string `json:"preferred_label,omitempty"` - SuggestedNextIDs []string `json:"suggested_next_ids,omitempty"` - ContextUpdates map[string]any `json:"context_updates,omitempty"` - Notes string `json:"notes,omitempty"` - FailureReason string `json:"failure_reason,omitempty"` + Status StageStatus `json:"status"` + PreferredLabel string `json:"preferred_label,omitempty"` + SuggestedNextIDs []string `json:"suggested_next_ids,omitempty"` + ContextUpdates map[string]any `json:"context_updates,omitempty"` + Notes string `json:"notes,omitempty"` + FailureReason string `json:"failure_reason,omitempty"` + Verification *VerificationResult `json:"verification,omitempty"` // Details is optional structured information for failures (or for debugging). // The engine does not use it for routing, but it must be preserved when present. Details any `json:"details,omitempty"` diff --git a/internal/attractor/runtime/status_test.go b/internal/attractor/runtime/status_test.go index b62e4d1c..7c6e7e30 100644 --- a/internal/attractor/runtime/status_test.go +++ b/internal/attractor/runtime/status_test.go @@ -11,12 +11,15 @@ func TestParseStageStatus_CanonicalAndLegacy(t *testing.T) { want StageStatus }{ {"success", StatusSuccess}, + {"degraded_success", StatusDegradedSuccess}, {"partial_success", StatusPartialSuccess}, {"retry", StatusRetry}, {"fail", StatusFail}, {"skipped", StatusSkipped}, // Compatibility aliases. 
{"ok", StatusSuccess}, + {"degraded-success", StatusDegradedSuccess}, + {"degradedsuccess", StatusDegradedSuccess}, {"error", StatusFail}, {"SUCCESS", StatusSuccess}, {"FAIL", StatusFail}, @@ -68,6 +71,9 @@ func TestStageStatus_IsCanonical(t *testing.T) { if !StatusSuccess.IsCanonical() { t.Fatalf("StatusSuccess should be canonical") } + if !StatusDegradedSuccess.IsCanonical() { + t.Fatalf("StatusDegradedSuccess should be canonical") + } if !StatusFail.IsCanonical() { t.Fatalf("StatusFail should be canonical") } @@ -164,6 +170,72 @@ func TestDecodeOutcomeJSON_LegacyFailDetails_PopulatesFailureReason(t *testing.T } } +func TestDecodeOutcomeJSON_DegradedSuccessWithVerification(t *testing.T) { + input := `{ + "status": "degraded_success", + "notes": "implementation complete but tsc blocked by DNS failure", + "verification": { + "status": "blocked", + "blocked_reason": "npm registry DNS failure", + "commands": [ + {"command": "npx tsc", "exit_code": 1, "blocked": true, "reason": "EAI_AGAIN"} + ] + } + }` + o, err := DecodeOutcomeJSON([]byte(input)) + if err != nil { + t.Fatalf("DecodeOutcomeJSON: %v", err) + } + if o.Status != StatusDegradedSuccess { + t.Fatalf("status: got %q want %q", o.Status, StatusDegradedSuccess) + } + if o.Verification == nil { + t.Fatal("expected non-nil verification") + } + if o.Verification.Status != "blocked" { + t.Fatalf("verification status: got %q want %q", o.Verification.Status, "blocked") + } + if o.Verification.BlockedReason != "npm registry DNS failure" { + t.Fatalf("verification blocked_reason: got %q", o.Verification.BlockedReason) + } + if len(o.Verification.Commands) != 1 { + t.Fatalf("expected 1 verification command, got %d", len(o.Verification.Commands)) + } + cmd := o.Verification.Commands[0] + if cmd.Command != "npx tsc" || cmd.ExitCode != 1 || !cmd.Blocked { + t.Fatalf("unexpected verification command: %+v", cmd) + } +} + +func TestDecodeOutcomeJSON_SuccessWithPassedVerification(t *testing.T) { + input := `{ + "status": 
"success", + "verification": { + "status": "passed", + "commands": [ + {"command": "go test ./...", "exit_code": 0}, + {"command": "go vet ./...", "exit_code": 0} + ] + } + }` + o, err := DecodeOutcomeJSON([]byte(input)) + if err != nil { + t.Fatalf("DecodeOutcomeJSON: %v", err) + } + if o.Status != StatusSuccess { + t.Fatalf("status: got %q want %q", o.Status, StatusSuccess) + } + if o.Verification == nil { + t.Fatal("expected non-nil verification") + } + if o.Verification.Status != "passed" { + t.Fatalf("verification status: got %q want %q", o.Verification.Status, "passed") + } + if len(o.Verification.Commands) != 2 { + t.Fatalf("expected 2 verification commands, got %d", len(o.Verification.Commands)) + } +} + func TestDecodeOutcomeJSON_LegacyRetryDetails_PopulatesFailureReason(t *testing.T) { o, err := DecodeOutcomeJSON([]byte(`{"outcome":"retry","details":"transient timeout"}`)) if err != nil {