diff --git a/specs/github-comments.md b/specs/github-comments.md index 5a7824e2..4ab4e0e4 100644 --- a/specs/github-comments.md +++ b/specs/github-comments.md @@ -11,9 +11,11 @@ How Warden formats and manages PR comments: format conventions, deduplication, a One short sentence stating what breaks and why it matters. -
Verification +
Evidence -Evidence checked before reporting the finding. +- `createThing()` passes the caller-controlled value into `runThing()`. +- The adjacent write path checks `resolveThing()` before calling the same helper. +- This read path skips that guard, so access depends on the broader token scope.
@@ -29,7 +31,7 @@ Identified by Warden `skill-name` · `FINDING-ID` ``` - Title: bold, no emoji, no ID, no confidence, no severity -- Verification details render in a collapsible block when present +- Evidence details render in a collapsible block when present - Severity is communicated via GitHub check annotation level (failure/warning/notice) - Footer: skill name and finding ID in backticks - Additional locations in collapsible details block diff --git a/src/builtin-skills/code-review/SKILL.md b/src/builtin-skills/code-review/SKILL.md index 61f9b77f..2c24fbf3 100644 --- a/src/builtin-skills/code-review/SKILL.md +++ b/src/builtin-skills/code-review/SKILL.md @@ -83,5 +83,5 @@ No proof, no finding. Suspicion is not a result. - Title: name the exact bug and trigger. - Description: one short public comment stating the broken behavior and impact. Use a second sentence only if needed for the fix. -- `verification`: include trigger conditions, expected behavior, actual behavior, checked callers, guards, tests, schemas, or framework guarantees. +- `verification`: write a short evidence trace with concrete code facts showing the trigger, intended contract, changed behavior, and checks that fail to exclude it. Use 2-5 bullets when helpful. Do not use checklist labels or restate the description. - `suggestedFix`: include only when the fix is complete for the analyzed path. diff --git a/src/builtin-skills/security-review/SKILL.md b/src/builtin-skills/security-review/SKILL.md index eedc5203..3704cfb5 100644 --- a/src/builtin-skills/security-review/SKILL.md +++ b/src/builtin-skills/security-review/SKILL.md @@ -77,5 +77,5 @@ Load only matching references: - Title: name the vulnerability and impact. - Description: one short public comment stating the exploitable path and impact. Use a second sentence only if needed for the fix. -- `verification`: include source, sink or missing guard, boundary crossed, mitigations checked, and attacker-visible consequence. +- `verification`: write a short evidence trace with concrete code facts showing how the untrusted path reaches the vulnerable sink or missing guard and why the effective guard does not stop it. Use 2-5 bullets when helpful. Do not use checklist labels or restate the description. - `suggestedFix`: include only when the fix is complete for the analyzed file. diff --git a/src/output/renderer.test.ts b/src/output/renderer.test.ts index f6bb708e..763506de 100644 --- a/src/output/renderer.test.ts +++ b/src/output/renderer.test.ts @@ -104,7 +104,7 @@ describe('renderSkillReport', () => { expect(result.review!.comments[0]!.body).not.toContain('confidence'); }); - it('renders verification details in a collapsible section', () => { + it('renders evidence details in a collapsible section', () => { const report: SkillReport = { ...baseReport, skill: 'code-review', @@ -115,7 +115,10 @@ describe('renderSkillReport', () => { title: 'Vitest runs in watch mode in CI', description: 'The test script can hang CI because Vitest watches by default. Use `vitest run` so the job exits after one pass.', - verification: 'Checked package.json scripts and Vitest default behavior.', + verification: [ + '- `package.json` still runs `vitest` without `run`.', + '- Vitest defaults to watch mode outside explicit run mode.', + ].join('\n'), location: { path: 'package.json', startLine: 10, @@ -129,8 +132,9 @@ describe('renderSkillReport', () => { expect(body).toContain('Vitest runs in watch mode in CI'); expect(body).toContain('The test script can hang CI'); - expect(body).toContain('
Verification'); - expect(body).toContain('Checked package.json scripts'); + expect(body).toContain('
Evidence'); + expect(body).not.toContain('
Verification'); + expect(body).toContain('- `package.json` still runs `vitest` without `run`.'); }); it('includes deduplication marker in comments', () => { @@ -1314,7 +1318,7 @@ describe('renderFindingsBody', () => { expect(body).toContain('Identified by Warden security-review'); }); - it('renders verification details in body findings', () => { + it('renders evidence details in body findings', () => { const findings = [ { id: 'f1', @@ -1329,7 +1333,8 @@ describe('renderFindingsBody', () => { const body = renderFindingsBody(findings, 'code-review'); expect(body).toContain('Short visible comment'); - expect(body).toContain('
Verification'); + expect(body).toContain('
Evidence'); + expect(body).not.toContain('
Verification'); expect(body).toContain('Checked the fallback review body path.'); }); diff --git a/src/output/renderer.ts b/src/output/renderer.ts index 995ef5b3..b1b3a832 100644 --- a/src/output/renderer.ts +++ b/src/output/renderer.ts @@ -70,7 +70,7 @@ function renderReview( } let body = `**${escapeHtml(finding.title)}**\n\n${escapeHtml(finding.description)}`; - if (finding.verification) { + if (finding.verification?.trim()) { body += `\n\n${renderVerification(finding.verification)}`; } @@ -166,7 +166,7 @@ function renderSuggestion(description: string, diff: string): string { } function renderVerification(verification: string): string { - return `
Verification\n\n${escapeHtml(verification)}\n\n
`; + return `
Evidence\n\n${escapeHtml(verification.trim())}\n\n
`; } function renderHiddenFindingsLink(hiddenCount: number, checkRunUrl: string): string { @@ -279,7 +279,7 @@ export function renderFindingsBody(findings: Finding[], skill: string): string { lines.push(''); lines.push(escapeHtml(finding.description)); lines.push(''); - if (finding.verification) { + if (finding.verification?.trim()) { lines.push(renderVerification(finding.verification)); lines.push(''); } diff --git a/src/sdk/prompt.ts b/src/sdk/prompt.ts index aee81ff2..84f43102 100644 --- a/src/sdk/prompt.ts +++ b/src/sdk/prompt.ts @@ -25,14 +25,14 @@ export function buildHunkSystemPrompt(skill: SkillDefinition): string { You are a code analysis agent for Warden. You evaluate code changes against specific skill criteria and report findings ONLY when the code violates or conflicts with those criteria. You do not perform general code review or report issues outside the skill's scope. `, - ` + ` Before reporting a finding: 1. Read the relevant source code to understand the full context 2. Trace through the code path — follow imports, base classes, and indirect references, not just the immediate file 3. Verify your assumptions — confirm the issue exists, don't infer from incomplete information 4. Ensure the finding references lines within the hunk being analyzed -5. Document your verification in the 'verification' field of each finding -`, +5. Document the evidence trace in the 'verification' field of each finding +`, ` The following defines the ONLY criteria you should evaluate. Do not report findings outside this scope: @@ -42,7 +42,7 @@ ${skill.prompt} buildJsonOutputSection(` Example response format: -{"findings": [{"id": "example-1", "severity": "medium", "confidence": "high", "title": "Issue title", "description": "Description", "location": {"path": "file.ts", "startLine": 10}}]} +{"findings": [{"id": "example-1", "severity": "medium", "confidence": "high", "title": "Issue title", "description": "Description", "location": {"path": "file.ts", "startLine": 10}, "verification": "- \`startRun()\` passes the changed value into \`finishRun()\`.\\n- The caller does not guard this case before calling \`startRun()\`."}]} Full schema: { @@ -58,7 +58,7 @@ Full schema: "startLine": 10, "endLine": 15 }, - "verification": "Required. Detailed evidence for the collapsible verification block: files/functions checked, trigger conditions, expected vs actual behavior, and why mitigations do not apply.", + "verification": "Required. Evidence for the public Evidence block. Write 2-5 short Markdown bullets tracing the concrete code path, guard, condition, or behavior that makes the finding real. Use function/file names when useful. Do not use checklist labels, generic reasoning, or restate the description.", "suggestedFix": { "description": "How to fix this issue", "diff": "unified diff format" @@ -77,7 +77,9 @@ Requirements: - The fix would be incomplete or you're uncertain about the correct solution - The fix requires changes to a different file or a new file (briefly name the fix in the description field instead) - "description" is rendered directly in GitHub inline comments. Keep it brief and actionable, usually one sentence. -- Put proof, trace notes, checked files, and expected/actual breakdowns in "verification", not "description". +- Put the concrete evidence trace in "verification", not "description". +- Write "verification" as evidence, not reasoning: facts from the code path, guards, conditions, and observed behavior that make the finding believable. +- Do not format "verification" as any labeled checklist or template. - Do not include severity, confidence, finding ID, skill name, or generic review framing in "description". - Focus your analysis on the code changes in the hunk. Surrounding context and tool results are for understanding only -- all findings must reference lines within the hunk range. `), diff --git a/src/sdk/runner.test.ts b/src/sdk/runner.test.ts index 5d206ea0..e9781306 100644 --- a/src/sdk/runner.test.ts +++ b/src/sdk/runner.test.ts @@ -449,6 +449,13 @@ describe('buildSystemPrompt', () => { expect(prompt).not.toContain('assets/'); }); + it('asks for a compact evidence trace in finding verification', () => { + const prompt = buildSystemPrompt(baseSkill); + expect(prompt).toContain('Evidence for the public Evidence block'); + expect(prompt).toContain('2-5 short Markdown bullets tracing the concrete code path'); + expect(prompt).toContain('Do not format "verification" as any labeled checklist or template'); + }); + it('does not include resource guidance when rootDir has no resource directories', () => { const skill: SkillDefinition = { ...baseSkill, rootDir: tempDir }; const prompt = buildSystemPrompt(skill); diff --git a/src/sdk/verify.test.ts b/src/sdk/verify.test.ts index 35c67be7..50c8b8b2 100644 --- a/src/sdk/verify.test.ts +++ b/src/sdk/verify.test.ts @@ -112,6 +112,12 @@ describe('verifyFindings', () => { options: expect.objectContaining({ model: 'claude-haiku-4-5' }), userPrompt: expect.stringContaining(''), })); + expect(runtime.runSkill).toHaveBeenCalledWith(expect.objectContaining({ + systemPrompt: expect.stringContaining('public Evidence block'), + })); + expect(runtime.runSkill).toHaveBeenCalledWith(expect.objectContaining({ + systemPrompt: expect.stringContaining('Do not use checklist labels'), + })); expect(runtime.runSkill).toHaveBeenCalledWith(expect.objectContaining({ userPrompt: expect.stringContaining(''), })); diff --git a/src/sdk/verify.ts b/src/sdk/verify.ts index 3d1868f7..a27190fa 100644 --- a/src/sdk/verify.ts +++ b/src/sdk/verify.ts @@ -83,12 +83,16 @@ ${skill.prompt} - Keep findings only when the issue is still real after tracing. -- Revise findings when the issue is real but the severity, confidence, title, description, or verification needs a narrower scope. +- Revise findings when the issue is real but the severity, confidence, title, description, or evidence trace needs a narrower scope. - Reject findings when the path is mitigated, unreachable, intentional, outside skill scope, or lacks a concrete code-level violation of the skill criteria. - Do not reject solely because broader repository invariants or caller behavior are incomplete in the inspected context. If the changed code shows a concrete source, boundary, and sink with no verified mitigation, keep or revise the finding. - When reachability or impact is plausible but not fully proven, keep the finding and revise severity, confidence, or scope instead of rejecting it. + +For revised findings, write the "verification" field as evidence for the public Evidence block: 2-5 short Markdown bullets tracing the concrete code path, guard, condition, or behavior that makes the finding real. Use function/file names when useful. Do not use checklist labels, generic reasoning, or restate the description. + + ${buildJsonOutputSection(` {"verdict":"keep|revise|reject","finding":{...},"reason":"short reason"}