diff --git a/demo/substack-spec-v01.dot b/demo/substack-spec-v01.dot index 486eb721..1bcc4e10 100644 --- a/demo/substack-spec-v01.dot +++ b/demo/substack-spec-v01.dot @@ -81,7 +81,7 @@ digraph substack_spec_v01 { max_agent_turns=300, max_tokens=32768, label="Implement", - prompt="Goal: $goal\n\nRead .ai/runs/$KILROY_RUN_ID/plan_final.md, .ai/runs/$KILROY_RUN_ID/spec.md, and .ai/runs/$KILROY_RUN_ID/definition_of_done.md. If the spec or DoD files do not exist at those paths, fall back to reading substack-spec-v01.md and substack-dod-v01.md directly.\n\nBEFORE ANYTHING ELSE: check if .ai/verify_errors.log exists. If it does, read it — it contains the exact commands that failed and their error output from the verify chain. Fix every error listed in that file, then delete .ai/verify_errors.log when all fixes are applied. Do NOT regenerate working code — only fix the specific errors.\n\nAlso check if .ai/runs/$KILROY_RUN_ID/verify_fidelity.md exists. If it does, read it — it contains per-AC pass/fail verdicts from the fidelity check. Fix every failing AC listed in that file.\n\nIf .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, read it and fix ONLY identified gaps — do NOT regenerate working code. On repair passes, read and fix existing files rather than skipping them.\n\nImplement the complete Substack Creator Newsletter Engine as a single pass. On a fresh pass (no postmortem), check if target files already exist on disk and are non-empty — if so, skip those files. Implement each module with complete, functional code — no stubs, no placeholders, no TODO comments. Follow the plan and spec precisely.\n\nImplementation order (core infrastructure first, then features, then tests/deploy):\n\n1. Project scaffold — package.json, vite.config.ts, tsconfig.json, index.html, src/main.tsx, src/App.tsx. Install dependencies: react, react-dom, react-router-dom, idb, @google/generative-ai. 
Write ALL validation scripts:\n - scripts/validate-build.sh: runs npm run build, checks dist/ exists\n - scripts/validate-fmt.sh: runs npx prettier --check src/\n - scripts/validate-test.sh: runs integration scenarios first, then smoke, writes evidence + .ai/test-evidence/latest/manifest.json even on failure\n - scripts/validate-browser.sh: runs browser verification and captures artifacts\n - scripts/fix-fmt.sh: runs npx prettier --write src/\n - scripts/validate-artifacts.sh: verifies manifest scenario IDs match DoD integration scenarios\n All scripts: #!/bin/sh, set -e, POSIX sh failure trap.\n\n2. LLM client — src/lib/llm.ts, src/lib/llm-schemas.ts: structured JSON output with schema enforcement, retry with error feedback and intelligent backoff, model switching (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side API key.\n\n3. Persistence — src/lib/db.ts, src/lib/types.ts: IndexedDB via idb with stores for configuration (API key, company, voice, guardrails), drafts, sessions (all inputs/LLM responses/intermediate state), post history (Markdown + attribution). All data persists unless user resets.\n\n4. Shared UI — src/components/RichInput.tsx, Card.tsx, ProgressBar.tsx, StepIndicator.tsx, src/styles/global.css.\n\n5. Setup flow — src/pages/Settings.tsx and step components: API key, company (rich input + gemini-3.1-pro-preview confirm + back), voice, guardrails. Parallel completion, status icons. Reset everything with confirmation.\n\n6. Dashboard — src/pages/Dashboard.tsx: New Post button, Trending Topics button, Settings link, post history, draft resume.\n\n7. Trending Topics — src/pages/TrendingTopics.tsx: parallel gemini-3-flash-preview search grounding queries, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post prefilled.\n\n8. 
New Post — src/pages/NewPost.tsx with step components: Topic (rich input), Research (gemini-3-flash-preview search grounding, source cards with URL/title/author/date, highlight/delete), Outline (gemini-3.1-pro-preview one-shot, accept/back), Write (3 automatic gemini-3.1-pro-preview cycles: Write with citations, Edit for style, Guardrails-only), Complete (serif post with numbered footnotes, linked sources, attribution lineage).\n\n9. Demo mode — src/lib/demo.ts, src/pages/DemoMode.tsx, src/demo/bundled-session.json: session recording, replay through production path (fade-in prefills, highlight next button), bundled P&G session, cache-miss error with no API fallback.\n\n10. Test infrastructure — src/__tests__/: integration with canned data, smoke with gemini-2.5-flash-lite, manual mode option.\n\n11. Deploy config — railway.json or equivalent, deployment docs.\n\nEnsure App.tsx routing includes all pages. Verify imports/exports are consistent. Run npm install. Fix TypeScript errors.\n\nLog progress to .ai/runs/$KILROY_RUN_ID/implementation_log.md.\n\nPRE-EXIT VERIFICATION: if .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, run sh scripts/validate-build.sh and re-read targeted files to confirm fixes.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH) ONLY on failure: status=fail with failure_reason, failure_class, and failure_signature=implement_fail." + prompt="Goal: $goal\n\nRead .ai/runs/$KILROY_RUN_ID/plan_final.md, .ai/runs/$KILROY_RUN_ID/spec.md, and .ai/runs/$KILROY_RUN_ID/definition_of_done.md. If the spec or DoD files do not exist at those paths, fall back to reading substack-spec-v01.md and substack-dod-v01.md directly.\n\nBEFORE ANYTHING ELSE: check if .ai/runs/$KILROY_RUN_ID/verify_errors.log exists. If it does, read it — it contains the exact commands that failed and their error output from the verify chain. 
Fix every error listed in that file, then delete .ai/runs/$KILROY_RUN_ID/verify_errors.log when all fixes are applied. Do NOT regenerate working code — only fix the specific errors.\n\nAlso check if .ai/runs/$KILROY_RUN_ID/verify_fidelity.md exists. If it does, read it — it contains per-AC pass/fail verdicts from the fidelity check. Fix every failing AC listed in that file.\n\nIf .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, read it and fix ONLY identified gaps — do NOT regenerate working code. On repair passes, read and fix existing files rather than skipping them.\n\nImplement the complete Substack Creator Newsletter Engine as a single pass. On a fresh pass (no postmortem), check if target files already exist on disk and are non-empty — if so, skip those files. Implement each module with complete, functional code — no stubs, no placeholders, no TODO comments. Follow the plan and spec precisely.\n\nImplementation order (core infrastructure first, then features, then tests/deploy):\n\n1. Project scaffold — package.json, vite.config.ts, tsconfig.json, index.html, src/main.tsx, src/App.tsx. Install dependencies: react, react-dom, react-router-dom, idb, @google/generative-ai. Write ALL validation scripts:\n - scripts/validate-build.sh: runs npm run build, checks dist/ exists\n - scripts/validate-fmt.sh: runs npx prettier --check src/\n - scripts/validate-test.sh: runs integration scenarios first, then smoke, writes evidence + .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json even on failure\n - scripts/validate-browser.sh: runs browser verification and captures artifacts\n - scripts/fix-fmt.sh: runs npx prettier --write src/\n - scripts/validate-artifacts.sh: verifies manifest scenario IDs match DoD integration scenarios\n All scripts: #!/bin/sh, set -e, POSIX sh failure trap.\n\n2. 
LLM client — src/lib/llm.ts, src/lib/llm-schemas.ts: structured JSON output with schema enforcement, retry with error feedback and intelligent backoff, model switching (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side API key.\n\n3. Persistence — src/lib/db.ts, src/lib/types.ts: IndexedDB via idb with stores for configuration (API key, company, voice, guardrails), drafts, sessions (all inputs/LLM responses/intermediate state), post history (Markdown + attribution). All data persists unless user resets.\n\n4. Shared UI — src/components/RichInput.tsx, Card.tsx, ProgressBar.tsx, StepIndicator.tsx, src/styles/global.css.\n\n5. Setup flow — src/pages/Settings.tsx and step components: API key, company (rich input + gemini-3.1-pro-preview confirm + back), voice, guardrails. Parallel completion, status icons. Reset everything with confirmation.\n\n6. Dashboard — src/pages/Dashboard.tsx: New Post button, Trending Topics button, Settings link, post history, draft resume.\n\n7. Trending Topics — src/pages/TrendingTopics.tsx: parallel gemini-3-flash-preview search grounding queries, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post prefilled.\n\n8. New Post — src/pages/NewPost.tsx with step components: Topic (rich input), Research (gemini-3-flash-preview search grounding, source cards with URL/title/author/date, highlight/delete), Outline (gemini-3.1-pro-preview one-shot, accept/back), Write (3 automatic gemini-3.1-pro-preview cycles: Write with citations, Edit for style, Guardrails-only), Complete (serif post with numbered footnotes, linked sources, attribution lineage).\n\n9. Demo mode — src/lib/demo.ts, src/pages/DemoMode.tsx, src/demo/bundled-session.json: session recording, replay through production path (fade-in prefills, highlight next button), bundled P&G session, cache-miss error with no API fallback.\n\n10. 
Test infrastructure — src/__tests__/: integration with canned data, smoke with gemini-2.5-flash-lite, manual mode option.\n\n11. Deploy config — railway.json or equivalent, deployment docs.\n\nEnsure App.tsx routing includes all pages. Verify imports/exports are consistent. Run npm install. Fix TypeScript errors.\n\nLog progress to .ai/runs/$KILROY_RUN_ID/implementation_log.md.\n\nPRE-EXIT VERIFICATION: if .ai/runs/$KILROY_RUN_ID/postmortem_latest.md exists, run sh scripts/validate-build.sh and re-read targeted files to confirm fixes.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH) ONLY on failure: status=fail with failure_reason, failure_class, and failure_signature=implement_fail." ] } @@ -95,46 +95,46 @@ digraph substack_spec_v01 { fix_fmt [ shape=parallelogram, max_retries=0, - tool_command="sh scripts/fix-fmt.sh 2>&1 || { printf '\\n=== VERIFY FAILURE: sh scripts/fix-fmt.sh ===\\n%s\\n' \"$(cat /tmp/fix-fmt.log 2>/dev/null || echo 'script missing or produced no output')\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/fix-fmt.sh 2>&1 | tee /tmp/fix-fmt.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/fix-fmt.sh ===\\n%s\\n' \"$(tail -30 /tmp/fix-fmt.log 2>/dev/null || echo 'script missing or produced no output')\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] verify_fmt [ shape=parallelogram, max_retries=0, - tool_command="sh scripts/validate-fmt.sh 2>&1 | tee /tmp/validate-fmt.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-fmt.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-fmt.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-fmt.sh 2>&1 | tee /tmp/validate-fmt.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-fmt.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-fmt.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_fmt [shape=diamond, label="Fmt OK?"] verify_build [ shape=parallelogram, -
tool_command="sh scripts/validate-build.sh 2>&1 | tee /tmp/validate-build.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-build.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-build.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-build.sh 2>&1 | tee /tmp/validate-build.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-build.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-build.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_build [shape=diamond, label="Build OK?"] verify_test [ shape=parallelogram, - tool_command="sh scripts/validate-test.sh 2>&1 | tee /tmp/validate-test.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-test.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-test.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-test.sh 2>&1 | tee /tmp/validate-test.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-test.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-test.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_test [shape=diamond, label="Tests OK?"] verify_browser [ shape=parallelogram, collect_browser_artifacts=true, - tool_command="sh scripts/validate-browser.sh 2>&1 | tee /tmp/validate-browser.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-browser.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-browser.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-browser.sh 2>&1 | tee /tmp/validate-browser.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-browser.sh ===\\n%s\\n' \"$(tail -50 /tmp/validate-browser.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_browser [shape=diamond, label="Browser OK?"] verify_artifacts [ shape=parallelogram, max_retries=0, - tool_command="sh 
scripts/validate-artifacts.sh 2>&1 | tee /tmp/validate-artifacts.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-artifacts.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-artifacts.log)\" >> .ai/verify_errors.log; exit 1; }" + tool_command="sh scripts/validate-artifacts.sh 2>&1 | tee /tmp/validate-artifacts.log; test ${PIPESTATUS[0]} -eq 0 || { printf '\\n=== VERIFY FAILURE: sh scripts/validate-artifacts.sh ===\\n%s\\n' \"$(tail -30 /tmp/validate-artifacts.log)\" >> .ai/runs/$KILROY_RUN_ID/verify_errors.log; exit 1; }" ] check_artifacts [shape=diamond, label="Artifacts OK?"] verify_fidelity [ shape=box, class="verify", - prompt="Read .ai/runs/$KILROY_RUN_ID/spec.md, .ai/runs/$KILROY_RUN_ID/definition_of_done.md, .ai/runs/$KILROY_RUN_ID/verify_fidelity.md (if present), .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json, and relevant implementation files.\n\nEvaluate these grouped acceptance checks and map each to concrete file paths:\nAC1: src/**/settings* and src/**/router* and src/**/indexeddb* - first-run routing, setup flow, persistence.\nAC2: src/**/dashboard* and src/**/history* and src/**/draft* - dashboard actions and resume/view flows.\nAC3: src/**/trending* and src/**/research* - grounded research, deterministic trends, prompt handoff.\nAC4: src/**/new-post* and src/**/outline* and src/**/write* and src/**/complete* - full Topic->Complete pipeline with automatic write cycles.\nAC5: src/**/citation* and src/**/markdown* - citation lineage, footnote rendering, attribution persistence.\nAC6: src/**/demo* and src/demo/** - session picker/replay, bundled P&G demo, cache-miss no-fallback behavior.\nAC7: src/**/llm* and src/**/schema* - structured outputs, retry/backoff, production/test model intent.\nAC8: scripts/validate-build.sh and railway.json/Procfile/README* - build/deploy-readiness by static review only, no live deployment execution.\nAC9: scripts/validate-test.sh and scripts/validate-browser.sh and test sources 
- integration before smoke, manual mode option, browser evidence capture.\nAC10: .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/** - IT-1..IT-12 manifest coverage and required artifact types.\nAC11: src/**/style* and src/**/card* and src/**/progress* - Substack-like visual contract including bars, cards, and serif post preview.\n\nWrite .ai/runs/$KILROY_RUN_ID/verify_fidelity.md with pass/fail verdict and evidence per AC1..AC11.\n\nOn ANY failure: also append to .ai/verify_errors.log with the header '=== VERIFY FAILURE: verify_fidelity ===' followed by the list of failing ACs and their specific issues, so the implement node can read the consolidated error log.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH):\nAll pass: status=success.\nAny fail: status=fail with failure_reason listing failing ACs, failure_class=deterministic_agent_bug, failure_signature as sorted comma-separated failing AC IDs (e.g. AC1,AC10)." 
+ prompt="Read .ai/runs/$KILROY_RUN_ID/spec.md, .ai/runs/$KILROY_RUN_ID/definition_of_done.md, .ai/runs/$KILROY_RUN_ID/verify_fidelity.md (if present), .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json, and relevant implementation files.\n\nEvaluate these grouped acceptance checks and map each to concrete file paths:\nAC1: src/**/settings* and src/**/router* and src/**/indexeddb* - first-run routing, setup flow, persistence.\nAC2: src/**/dashboard* and src/**/history* and src/**/draft* - dashboard actions and resume/view flows.\nAC3: src/**/trending* and src/**/research* - grounded research, deterministic trends, prompt handoff.\nAC4: src/**/new-post* and src/**/outline* and src/**/write* and src/**/complete* - full Topic->Complete pipeline with automatic write cycles.\nAC5: src/**/citation* and src/**/markdown* - citation lineage, footnote rendering, attribution persistence.\nAC6: src/**/demo* and src/demo/** - session picker/replay, bundled P&G demo, cache-miss no-fallback behavior.\nAC7: src/**/llm* and src/**/schema* - structured outputs, retry/backoff, production/test model intent.\nAC8: scripts/validate-build.sh and railway.json/Procfile/README* - build/deploy-readiness by static review only, no live deployment execution.\nAC9: scripts/validate-test.sh and scripts/validate-browser.sh and test sources - integration before smoke, manual mode option, browser evidence capture.\nAC10: .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/** - IT-1..IT-12 manifest coverage and required artifact types.\nAC11: src/**/style* and src/**/card* and src/**/progress* - Substack-like visual contract including bars, cards, and serif post preview.\n\nWrite .ai/runs/$KILROY_RUN_ID/verify_fidelity.md with pass/fail verdict and evidence per AC1..AC11.\n\nOn ANY failure: also append to .ai/runs/$KILROY_RUN_ID/verify_errors.log with the header '=== VERIFY FAILURE: verify_fidelity ===' followed by the list of failing 
ACs and their specific issues, so the implement node can read the consolidated error log.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH):\nAll pass: status=success.\nAny fail: status=fail with failure_reason listing failing ACs, failure_class=deterministic_agent_bug, failure_signature as sorted comma-separated failing AC IDs (e.g. AC1,AC10)." ] check_impl [shape=diamond, label="Impl OK?"] } @@ -151,12 +151,12 @@ digraph substack_spec_v01 { review_a [ class="branch-a", - prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4567 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4567/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. 
Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_a.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." 
+ prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4567 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4567/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/runs/$KILROY_RUN_ID/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 
automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_a.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." ] review_b [ class="branch-b", - prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4568 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4568/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. 
Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_b.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." 
+ prompt="Review the Substack Creator Newsletter Engine implementation against .ai/runs/$KILROY_RUN_ID/definition_of_done.md.\n\nRead the DoD for acceptance criteria. Read all implementation source files and .ai/runs/$KILROY_RUN_ID/test-evidence/latest/manifest.json.\n\n## MANDATORY: Browser verification\nYou MUST verify the app works in a real browser. Do not trust code reading alone.\n1. Run: npm run build (must exit 0)\n2. Start the preview server: npx vite preview --port 4568 &\n3. Wait 2 seconds, then use curl to fetch http://localhost:4568/ and verify it returns HTML with a root div\n4. Check that the HTML references JS and CSS bundles\n5. Kill the preview server when done\n6. Check browser artifacts in .ai/runs/$KILROY_RUN_ID/test-evidence/latest/ — screenshots must be real rendered pages (not 1x1 placeholders). If screenshot files are under 5KB, they are fake. REJECT.\n7. Check that playwright-report or equivalent browser test output exists and shows real test execution\n\nIf browser verification fails or artifacts are fake, REJECT immediately.\n\n## Code and AC verification\nCheck every AC group (AC1 through AC11):\n\nAC1: Build exits 0, static assets produced, deployment config present and coherent (review only, no live deploy)\nAC2: IndexedDB persistence for API key, config, posts, drafts, sessions, attribution mappings across reloads\nAC3: Structured JSON output with retry/backoff, correct model routing (gemini-3-flash-preview/gemini-3.1-pro-preview/gemini-2.5-flash-lite), client-side key\nAC4: Parallel setup (any order), status icons, rich input (text/upload/link), gemini-3.1-pro-preview confirmation, back button\nAC5: Dashboard with New Post and Trending Topics buttons, Settings link, post history, draft resume\nAC6: Trending Topics: parallel gemini-3-flash-preview search research, trend visualization, 3 gemini-3.1-pro-preview writing prompts, navigate to New Post\nAC7: Full post pipeline with source metadata, highlight/delete, one-shot outline, 3 
automatic write cycles, citations with attribution lineage\nAC8: Demo replay through production path, fade-in/highlight, bundled P&G session, cache-miss error\nAC9: Validation scripts and runtime evidence contract for build/test/browser checks\nAC10: IT-1..IT-12 evidence manifest coverage and artifact completeness\nAC11: Substack visual design: serif fonts, step dots, card primitive, accent progress bars, no spinners, footnoted post\n\nVerdict: APPROVED (all criteria met with evidence) or REJECTED (specific gaps by AC ID).\nWrite to .ai/runs/$KILROY_RUN_ID/review_b.md.\n\nWrite to $KILROY_STAGE_STATUS_PATH (fallback: $KILROY_STAGE_STATUS_FALLBACK_PATH): On success: status=success. On failure: status=fail with failure_reason and failure_class." ] review_consensus [ @@ -204,7 +204,7 @@ digraph substack_spec_v01 { // Implement -> Verify chain implement -> fix_fmt - // Verify chain — failures go directly back to implement (errors logged to .ai/verify_errors.log) + // Verify chain — failures go directly back to implement (errors logged to .ai/runs/$KILROY_RUN_ID/verify_errors.log) fix_fmt -> verify_fmt verify_fmt -> check_fmt check_fmt -> verify_build [condition="outcome=success"] diff --git a/internal/attractor/engine/cli_only_models.go b/internal/attractor/engine/cli_only_models.go index 2ff9de6a..2feb2bff 100644 --- a/internal/attractor/engine/cli_only_models.go +++ b/internal/attractor/engine/cli_only_models.go @@ -5,7 +5,8 @@ import "strings" // cliOnlyModelIDs lists models that MUST route through CLI backend regardless // of provider backend configuration. These models have no API endpoint. 
var cliOnlyModelIDs = map[string]bool{ - "gpt-5.4-spark": true, + "gpt-5.3-codex-spark": true, + "gpt-5.4-spark": true, } // isCLIOnlyModel returns true if the given model ID (with or without provider diff --git a/internal/attractor/engine/cli_only_models_test.go b/internal/attractor/engine/cli_only_models_test.go index 316758fb..3b6d02f1 100644 --- a/internal/attractor/engine/cli_only_models_test.go +++ b/internal/attractor/engine/cli_only_models_test.go @@ -8,7 +8,7 @@ func TestIsCLIOnlyModel(t *testing.T) { want bool }{ {"gpt-5.4-spark", true}, - {"GPT-5.3-CODEX-SPARK", true}, // case-insensitive + {"GPT-5.3-CODEX-SPARK", true}, // case-insensitive {"openai/gpt-5.4-spark", true}, // with provider prefix {"gpt-5.4", false}, // regular codex {"gpt-5.4", false}, diff --git a/internal/attractor/engine/codergen_heartbeat_test.go b/internal/attractor/engine/codergen_heartbeat_test.go index 408e8a7e..531b1a62 100644 --- a/internal/attractor/engine/codergen_heartbeat_test.go +++ b/internal/attractor/engine/codergen_heartbeat_test.go @@ -350,7 +350,9 @@ digraph G { // TestRunWithConfig_APIBackend_StallWatchdogFiresDespiteHeartbeatGoroutine verifies // that the stall watchdog still fires when the API agent_loop session is truly // stalled (no new session events) even though the heartbeat goroutine is running. -// The conditional heartbeat should NOT emit progress when event_count is static. +// Heartbeat events are emitted unconditionally for observability but use +// appendProgressLivenessOnly when no new events are produced, which does not +// reset the stall watchdog timer. 
func TestRunWithConfig_APIBackend_StallWatchdogFiresDespiteHeartbeatGoroutine(t *testing.T) { repo := initTestRepo(t) logsRoot := t.TempDir() @@ -417,7 +419,9 @@ digraph G { // TestRunWithConfig_CLIBackend_StallWatchdogFiresDespiteHeartbeatGoroutine verifies // that the stall watchdog still fires when the CLI codergen process is truly // stalled (no stdout/stderr output) even though the heartbeat goroutine is running. -// The conditional heartbeat should NOT emit progress when file sizes are static. +// Heartbeat events are emitted unconditionally for observability but use +// appendProgressLivenessOnly when no output growth is detected, which does not +// reset the stall watchdog timer. func TestRunWithConfig_CLIBackend_StallWatchdogFiresDespiteHeartbeatGoroutine(t *testing.T) { repo := initTestRepo(t) logsRoot := t.TempDir() @@ -472,6 +476,95 @@ digraph G { t.Logf("stall watchdog fired as expected: %v", err) } +func TestRunWithConfig_HeartbeatEmitsDuringQuietPeriods(t *testing.T) { + repo := initTestRepo(t) + logsRoot := t.TempDir() + + pinned := writePinnedCatalog(t) + cxdbSrv := newCXDBTestServer(t) + + // Create a mock codex CLI that produces initial output, then goes quiet, + // then produces more output. The quiet period should still produce heartbeats. + cli := filepath.Join(t.TempDir(), "codex") + if err := os.WriteFile(cli, []byte(`#!/usr/bin/env bash +set -euo pipefail +echo '{"item":{"type":"message","role":"assistant","content":[{"type":"output_text","text":"starting"}]}}' >&1 +# Quiet period: no output for 3 seconds. 
+sleep 3 +echo '{"item":{"type":"message","role":"assistant","content":[{"type":"output_text","text":"done"}]}}' >&1 +`), 0o755); err != nil { + t.Fatal(err) + } + + t.Setenv("KILROY_CODERGEN_HEARTBEAT_INTERVAL", "1s") + t.Setenv("KILROY_CODEX_IDLE_TIMEOUT", "10s") + + cfg := &RunConfigFile{Version: 1} + cfg.Repo.Path = repo + cfg.CXDB.BinaryAddr = cxdbSrv.BinaryAddr() + cfg.CXDB.HTTPBaseURL = cxdbSrv.URL() + cfg.LLM.CLIProfile = "test_shim" + cfg.LLM.Providers = map[string]ProviderConfig{ + "openai": {Backend: BackendCLI, Executable: cli}, + } + cfg.ModelDB.OpenRouterModelInfoPath = pinned + cfg.ModelDB.OpenRouterModelInfoUpdatePolicy = "pinned" + cfg.Git.RunBranchPrefix = "attractor/run" + + dot := []byte(` +digraph G { + graph [goal="test quiet period heartbeats"] + start [shape=Mdiamond] + exit [shape=Msquare] + a [shape=box, llm_provider=openai, llm_model=gpt-5.4, prompt="do something quiet"] + start -> a -> exit +} +`) + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + res, err := RunWithConfig(ctx, dot, cfg, RunOptions{RunID: "quiet-heartbeat-test", LogsRoot: logsRoot, AllowTestShim: true}) + if err != nil { + t.Fatalf("RunWithConfig: %v", err) + } + + progressPath := filepath.Join(res.LogsRoot, "progress.ndjson") + data, err := os.ReadFile(progressPath) + if err != nil { + t.Fatalf("read progress.ndjson: %v", err) + } + + heartbeats := 0 + var hasQuietHeartbeat bool + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var ev map[string]any + if err := json.Unmarshal([]byte(line), &ev); err != nil { + continue + } + if ev["event"] == "stage_heartbeat" && ev["node_id"] == "a" { + heartbeats++ + if _, ok := ev["since_last_output_s"]; !ok { + t.Error("heartbeat missing since_last_output_s field") + } + sinceOutput, _ := ev["since_last_output_s"].(float64) + if sinceOutput >= 1 { + hasQuietHeartbeat = true + } + } + } + if heartbeats < 2 { + 
t.Fatalf("expected at least 2 heartbeat events (some during quiet period), got %d", heartbeats) + } + if !hasQuietHeartbeat { + t.Fatal("expected at least one heartbeat with since_last_output_s >= 1 (quiet period heartbeat)") + } + t.Logf("found %d heartbeat events, quiet period heartbeats present", heartbeats) +} + func TestRunWithConfig_HeartbeatStopsAfterProcessExit(t *testing.T) { events := runHeartbeatFixture(t) endIdx := findEventIndex(events, "stage_attempt_end", "a") diff --git a/internal/attractor/engine/codergen_router.go b/internal/attractor/engine/codergen_router.go index 769cc809..ee1b3a0b 100644 --- a/internal/attractor/engine/codergen_router.go +++ b/internal/attractor/engine/codergen_router.go @@ -328,21 +328,30 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m ticker := time.NewTicker(interval) defer ticker.Stop() var lastCount int + lastEventTime := time.Now() for { select { case <-ticker.C: eventsMu.Lock() count := len(events) eventsMu.Unlock() - if count > lastCount { + eventsGrew := count > lastCount + if eventsGrew { lastCount = count - if execCtx != nil && execCtx.Engine != nil { - execCtx.Engine.appendProgress(map[string]any{ - "event": "stage_heartbeat", - "node_id": node.ID, - "elapsed_s": int(time.Since(apiStart).Seconds()), - "event_count": count, - }) + lastEventTime = time.Now() + } + if execCtx != nil && execCtx.Engine != nil { + ev := map[string]any{ + "event": "stage_heartbeat", + "node_id": node.ID, + "elapsed_s": int(time.Since(apiStart).Seconds()), + "event_count": count, + "since_last_output_s": int(time.Since(lastEventTime).Seconds()), + } + if eventsGrew { + execCtx.Engine.appendProgress(ev) + } else { + execCtx.Engine.appendProgressLivenessOnly(ev) } } case <-heartbeatStop: @@ -1159,22 +1168,31 @@ func (r *CodergenRouter) runCLI(ctx context.Context, execCtx *Execution, node *m ticker := time.NewTicker(interval) defer ticker.Stop() var lastStdoutSz, lastStderrSz int64 + lastOutputTime := 
time.Now() for { select { case <-ticker.C: stdoutSz, _ := fileSize(stdoutPath) stderrSz, _ := fileSize(stderrPath) - if stdoutSz > lastStdoutSz || stderrSz > lastStderrSz { + outputGrew := stdoutSz > lastStdoutSz || stderrSz > lastStderrSz + if outputGrew { lastStdoutSz = stdoutSz lastStderrSz = stderrSz - if execCtx != nil && execCtx.Engine != nil { - execCtx.Engine.appendProgress(map[string]any{ - "event": "stage_heartbeat", - "node_id": node.ID, - "elapsed_s": int(time.Since(start).Seconds()), - "stdout_bytes": stdoutSz, - "stderr_bytes": stderrSz, - }) + lastOutputTime = time.Now() + } + if execCtx != nil && execCtx.Engine != nil { + ev := map[string]any{ + "event": "stage_heartbeat", + "node_id": node.ID, + "elapsed_s": int(time.Since(start).Seconds()), + "stdout_bytes": stdoutSz, + "stderr_bytes": stderrSz, + "since_last_output_s": int(time.Since(lastOutputTime).Seconds()), + } + if outputGrew { + execCtx.Engine.appendProgress(ev) + } else { + execCtx.Engine.appendProgressLivenessOnly(ev) } } case <-heartbeatStop: diff --git a/internal/attractor/engine/progress.go b/internal/attractor/engine/progress.go index ba25b9f1..07a5c636 100644 --- a/internal/attractor/engine/progress.go +++ b/internal/attractor/engine/progress.go @@ -17,6 +17,17 @@ import ( // // This is best-effort: progress logging must never block or fail a run. func (e *Engine) appendProgress(ev map[string]any) { + e.appendProgressImpl(ev, true) +} + +// appendProgressLivenessOnly writes a progress event to progress.ndjson and +// live.json for observability but does NOT reset the stall watchdog timer. +// Use this for unconditional heartbeat emissions that should not mask true stalls. 
+func (e *Engine) appendProgressLivenessOnly(ev map[string]any) { + e.appendProgressImpl(ev, false) +} + +func (e *Engine) appendProgressImpl(ev map[string]any, updateStallTimer bool) { if e == nil { return } @@ -47,7 +58,9 @@ func (e *Engine) appendProgress(ev map[string]any) { e.progressMu.Lock() defer e.progressMu.Unlock() - e.lastProgressAt = now + if updateStallTimer { + e.lastProgressAt = now + } // Append to progress.ndjson. // Intentionally open/close on each event so writes are immediately flushed