diff --git a/.github/workflows/evaluate-pr-candidate.yml b/.github/workflows/evaluate-pr-candidate.yml
new file mode 100644
index 0000000..e2ca8da
--- /dev/null
+++ b/.github/workflows/evaluate-pr-candidate.yml
@@ -0,0 +1,68 @@
+name: Evaluate PR Candidate
+
+on:
+  issues:
+    types: [opened]
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    # Only run if issue body contains a GitHub PR URL
+    if: contains(github.event.issue.body, 'github.com') && contains(github.event.issue.body, '/pull/')
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install
+
+      - name: Extract PR URL from issue
+        id: extract
+        env:
+          # Pass the issue body through an env var instead of interpolating it
+          # into the script, so quotes or backticks in user-controlled text cannot inject into the shell.
+          ISSUE_BODY: ${{ github.event.issue.body }}
+        run: |
+          # Extract first PR URL from issue body
+          PR_URL=$(echo "$ISSUE_BODY" | grep -oE 'https://github\.com/[^/]+/[^/]+/pull/[0-9]+' | head -1)
+          echo "pr_url=${PR_URL}" >> $GITHUB_OUTPUT
+          if [ -z "$PR_URL" ]; then
+            echo "No valid PR URL found"
+            echo "found=false" >> $GITHUB_OUTPUT
+          else
+            echo "Found PR URL: $PR_URL"
+            echo "found=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Evaluate PR candidate
+        if: steps.extract.outputs.found == 'true'
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ steps.extract.outputs.pr_url }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          REPO_OWNER: ${{ github.repository_owner }}
+          REPO_NAME: ${{ github.event.repository.name }}
+        run: bun github/evaluate-pr.ts
+
+      - name: Handle missing PR URL
+        if: steps.extract.outputs.found == 'false'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh issue comment ${{ github.event.issue.number }} --body "## Invalid Submission
+
+          No valid GitHub PR URL was found in the issue body.
+
+          Please provide a PR URL in the format:
+          \`\`\`
+          https://github.com/owner/repo/pull/123
+          \`\`\`
+
+          Then create a new issue with the correct URL."
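The extraction step above pulls the first PR link out of the issue body with grep; note that the job-level `if:` only checks for the substrings `github.com` and `/pull/`, so the `found=false` branch still matters. A minimal TypeScript sketch of the same matching behaviour (it mirrors the `extractPrUrlFromText` helper added later in this diff; the sample issue bodies are hypothetical):

```ts
// Hypothetical issue bodies: only the first full PR URL is used.
const bodies = [
  "Please consider https://github.com/owner/repo/pull/123 for the benchmark.",
  "Docs live at github.com/owner/repo under /pull/ guidelines.", // passes the if: filter, but yields no URL
];

// Same shape as the grep pattern in the workflow step above.
const PR_LINK = /https:\/\/github\.com\/[^/]+\/[^/]+\/pull\/[0-9]+/;

for (const body of bodies) {
  const match = body.match(PR_LINK);
  console.log(match ? `found ${match[0]}` : "No valid PR URL found");
}
```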
diff --git a/cli.ts b/cli.ts
index 54731c2..b04f4ce 100644
--- a/cli.ts
+++ b/cli.ts
@@ -8,6 +8,8 @@ import { Task } from "~/src/tasks/index.js";
 import { Summarizer } from "~/src/summarizer.js";
 import { Logger } from "~/src/util/logger.js";
 import { Eval } from "./src/eval.js";
+import { PrEval } from "./src/pr-eval/index.js";
+import { Reporter } from "./src/pr-eval/reporter.js";
 
 const cli = yargs(hideBin(process.argv))
   .scriptName("orvl")
@@ -33,6 +35,60 @@ cli.command(
   },
 );
 
+cli.command(
+  "evaluate-pr <url>",
+  "Evaluate a PR as a benchmark candidate",
+  async (yargs) =>
+    yargs
+      .positional("url", {
+        type: "string",
+        description: "GitHub PR URL (e.g., https://github.com/owner/repo/pull/123)",
+        required: true,
+      })
+      .option("output", {
+        type: "string",
+        description: "Output file path for JSON results",
+      })
+      .example([
+        ["orvl evaluate-pr https://github.com/owner/repo/pull/123"],
+        ["orvl evaluate-pr https://github.com/owner/repo/pull/123 --output result.json"],
+      ]),
+  async ({ url, output }) => {
+    if (!url) throw new Error("PR URL is required");
+
+    const logger = Logger.create("[pr-eval]");
+
+    const result = await PrEval.evaluate(url, { logger });
+
+    // Print summary
+    logger.log("");
+    logger.log("=".repeat(60));
+    logger.log(`Final Score: ${result.finalScore.toFixed(1)}/100`);
+    logger.log(`Recommendation: ${result.recommendation.toUpperCase()}`);
+    logger.log("=".repeat(60));
+    logger.log("");
+
+    result.criteria.forEach((c) => {
+      const consensus = PrEval.getConsensusLevel(c.variance);
+      logger.log(`${c.displayName}: ${c.average.toFixed(0)}/100 (${consensus} consensus)`);
+      c.judges.forEach((j) => {
+        logger.log(`  - ${j.judge}: ${j.score}/100`);
+      });
+    });
+
+    if (output) {
+      await writeFile(output, JSON.stringify(result, null, 2));
+      logger.log(`\nResults saved to ${output}`);
+    }
+
+    // Also print the formatted comment
+    logger.log("\n" + "=".repeat(60));
+    logger.log("FORMATTED COMMENT PREVIEW:");
+    logger.log("=".repeat(60) + "\n");
+    logger.log(Reporter.formatComment(result));
+  },
+);
+
 cli.command(
   "$0 [agent]",
   "Run benchmark",
diff --git a/github/evaluate-pr.ts b/github/evaluate-pr.ts
new file mode 100644
index 0000000..2434bd3
--- /dev/null
+++ b/github/evaluate-pr.ts
@@ -0,0 +1,82 @@
+#!/usr/bin/env bun
+import { Logger } from "../src/util/logger.js";
+import { PrEval } from "../src/pr-eval/index.js";
+import { Reporter } from "../src/pr-eval/reporter.js";
+import { addIssueComment, addIssueLabels } from "../src/util/github.js";
+
+const prUrl = process.env.PR_URL;
+const issueNumber = process.env.ISSUE_NUMBER;
+const repoOwner = process.env.REPO_OWNER;
+const repoName = process.env.REPO_NAME;
+
+if (!prUrl) {
+  console.error("PR_URL environment variable is required");
+  process.exit(1);
+}
+
+if (!issueNumber) {
+  console.error("ISSUE_NUMBER environment variable is required");
+  process.exit(1);
+}
+
+if (!repoOwner || !repoName) {
+  console.error("REPO_OWNER and REPO_NAME environment variables are required");
+  process.exit(1);
+}
+
+const issueNum = parseInt(issueNumber, 10);
+if (isNaN(issueNum)) {
+  console.error("ISSUE_NUMBER must be a valid number");
+  process.exit(1);
+}
+
+const logger = Logger.create("[pr-eval]");
+
+async function main() {
+  logger.log(`Evaluating PR: ${prUrl}`);
+  logger.log(`Will post results to issue #${issueNum}`);
+
+  try {
+    const result = await PrEval.evaluate(prUrl, { logger });
+
+    const comment = Reporter.formatComment(result);
+    const labels = Reporter.getLabels(result);
+
+    logger.log(`Posting comment to 
${repoOwner}/${repoName}#${issueNum}...`); + await addIssueComment(repoOwner, repoName, issueNum, comment); + + logger.log(`Adding labels: ${labels.join(", ")}...`); + await addIssueLabels(repoOwner, repoName, issueNum, labels); + + logger.log(`Evaluation complete: ${result.recommendation}`); + logger.log(`Final score: ${result.finalScore.toFixed(1)}/100`); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + logger.error(`Evaluation failed: ${message}`); + + const errorComment = `## Evaluation Failed + +Unable to evaluate PR candidate: + +\`\`\` +${message} +\`\`\` + +Please check that: +- The PR URL is valid and the repository is public +- The PR exists and is accessible + +If the issue persists, please contact a maintainer.`; + + try { + await addIssueComment(repoOwner, repoName, issueNum, errorComment); + await addIssueLabels(repoOwner, repoName, issueNum, ["benchmark-evaluation-failed"]); + } catch (commentError) { + logger.error(`Failed to post error comment: ${commentError}`); + } + + process.exit(1); + } +} + +main(); diff --git a/src/pr-eval/criteria/eval-feasibility.ts b/src/pr-eval/criteria/eval-feasibility.ts new file mode 100644 index 0000000..a9361d1 --- /dev/null +++ b/src/pr-eval/criteria/eval-feasibility.ts @@ -0,0 +1,138 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be FEASIBLY EVALUATED as a benchmark task. + +**YOUR ROLE**: Determine if an AI agent's attempt to reproduce this PR can be objectively scored. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Diff Size & Complexity: +1. **Appropriate diff size** + - Not too small (< 20 lines) - trivially simple + - Not too large (> 1000 lines) - unmanageable for evaluation + - Ideal: 50-500 lines of meaningful changes + +2. **Complexity balance** + - Complex enough to be a meaningful challenge + - Not so complex that evaluation becomes ambiguous + - Should require thought, not just transcription + +### Deterministic Verification: +1. **Objective success criteria** + - Can success be measured programmatically? + - Are there clear pass/fail conditions? + - Can we run automated checks (tests, linting, builds)? + +2. **Test-based verification** + - Existing tests that must continue passing + - New tests that verify the specific change + - Build/lint checks that must succeed + +3. **Diff-based verification** + - Key code patterns identifiable in expected output + - Logic equivalence can be assessed + - Not purely stylistic where any approach works + +### Practical Constraints: +1. **Environment requirements** + - No special hardware needed (GPU, specific OS) + - No paid API keys required for testing + - Standard development environment sufficient + +2. **Time constraints** + - Can be completed in reasonable time (< 30 min agent runtime) + - No long-running processes required for verification + - Dependencies can be installed quickly + +3. 
**External dependencies** + - No external services needed during evaluation + - Self-contained within the repository + - No authentication to external systems + +--- + +## SCORING RUBRIC + +**90-100**: Highly feasible +- Clear verification via automated tests +- Appropriate diff size (100-500 lines) +- Deterministic success criteria +- Standard environment, no special requirements +- Quick setup and verification + +**70-89**: Feasible with minor challenges +- Verification possible but requires some interpretation +- Slightly outside ideal size range +- Some ambiguity in success criteria +- Minor setup complexity + +**50-69**: Marginally feasible +- Verification would be challenging +- Size at the edges (very small or quite large) +- Success criteria unclear in places +- Some environment complexity + +**30-49**: Difficult to evaluate +- Verification very challenging or subjective +- Inappropriate size for benchmark +- Mostly subjective success criteria +- Complex environment or dependencies + +**0-29**: Not feasible +- No clear verification method +- Way too large (1000+ lines) or trivially small (< 10 lines) +- Requires external services or paid APIs +- Cannot be evaluated objectively + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 20) + .map((f) => ` - ${f.filename} (${f.changes} changes)`) + .join("\n"); + + return `Evaluate this Pull Request for EVALUATION FEASIBILITY as a benchmark task. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- **Total line changes: ${context.diffStats.totalLines}** +- Contains test files: ${context.hasTests ? "Yes" : "No"} + +## Files Changed + +${filesPreview} +${context.files.length > 20 ? `... and ${context.files.length - 20} more files` : ""} + +## PR Description + +${context.body || "(No description provided)"} + +## Diff Size Assessment + +The diff is ${context.diffStats.totalLines} lines total. +${context.diffTruncated ? "Note: The full diff was truncated due to size (>50K chars), indicating a very large PR." : ""} + +## Key Questions to Answer + +1. **Size appropriateness**: Is ${context.diffStats.totalLines} lines a reasonable size for a benchmark task? +2. **Verification method**: Can we verify correctness through tests, builds, or diff comparison? +3. **Environment needs**: Does this require any special setup, external services, or paid APIs? +4. **Time feasibility**: Can an AI agent reasonably complete this in under 30 minutes? 
+ +Based on the above, evaluate the EVALUATION FEASIBILITY of using this PR as a benchmark task.`; +} diff --git a/src/pr-eval/criteria/index.ts b/src/pr-eval/criteria/index.ts new file mode 100644 index 0000000..6085109 --- /dev/null +++ b/src/pr-eval/criteria/index.ts @@ -0,0 +1,43 @@ +import type { PrEvalContext } from "../fetcher.js"; +import * as scopeClarity from "./scope-clarity.js"; +import * as technicalQuality from "./technical-quality.js"; +import * as evalFeasibility from "./eval-feasibility.js"; +import * as reproducibility from "./reproducibility.js"; + +export interface Criterion { + systemPrompt: string; + createUserPrompt: (context: PrEvalContext) => string; +} + +export interface CriterionConfig { + criterion: Criterion; + weight: number; + displayName: string; +} + +export namespace PrCriteria { + export const all: Record = { + "scope-clarity": { + criterion: scopeClarity, + weight: 0.25, + displayName: "Scope & Clarity", + }, + "technical-quality": { + criterion: technicalQuality, + weight: 0.25, + displayName: "Technical Quality", + }, + "eval-feasibility": { + criterion: evalFeasibility, + weight: 0.25, + displayName: "Evaluation Feasibility", + }, + reproducibility: { + criterion: reproducibility, + weight: 0.25, + displayName: "Reproducibility", + }, + }; + + export const names = Object.keys(all) as Array; +} diff --git a/src/pr-eval/criteria/reproducibility.ts b/src/pr-eval/criteria/reproducibility.ts new file mode 100644 index 0000000..6d3c6bd --- /dev/null +++ b/src/pr-eval/criteria/reproducibility.ts @@ -0,0 +1,157 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be REPRODUCED as a benchmark task. + +**YOUR ROLE**: Determine if this PR can be converted into a clear, reproducible coding task that an AI agent could attempt. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Task Derivability: +1. **Clear starting point** + - Base commit is identifiable (the PR base branch) + - Repository state is reproducible + - No hidden dependencies or uncommitted prerequisites + +2. **Prompt generation potential** + - Intent can be described without revealing exact implementation + - Task can be phrased as a natural developer request + - Sufficient context available from PR title, description, and commits + - The "what" is clear even if exact "how" varies + +3. **Expected outcome clarity** + - Target state is well-defined + - Acceptance criteria can be derived from the PR + - Multiple valid implementations could potentially exist + - Success is about behavior, not exact code match + +### External Dependencies: +1. **Repository accessibility** + - Public repository accessible without authentication + - No private dependencies or internal packages + - No proprietary tools required + +2. **Environment reproducibility** + - Standard language/framework versions + - Dependencies are installable via package managers + - No proprietary or licensed software required + +3. **Data requirements** + - No external datasets needed + - No API calls to external services required for the task + - Self-contained within the codebase + +### Real-world Task Characteristics: +1. **Natural task framing** + - Could be a real developer request from a PM or lead + - Not artificially constructed or contrived + - Represents genuine development work + +2. 
**Documentation sufficiency** + - Enough context to understand the goal from PR alone + - Not requiring deep institutional/tribal knowledge + - Reasonable learning curve for understanding the codebase area + +3. **Isolation** + - Changes don't depend on simultaneous other PRs + - Can be applied cleanly to base branch + - No merge conflict complexity + +--- + +## SCORING RUBRIC + +**90-100**: Highly reproducible +- Clear base commit and branch +- Excellent PR description explaining intent +- No external dependencies +- Natural task that could be real developer work +- Standard, accessible environment +- Self-contained changes + +**70-89**: Reproducible with minor effort +- Good starting point identified +- Adequate description, some gaps +- Minimal external requirements +- Reasonable task framing +- Minor setup complexity + +**50-69**: Reproducible with effort +- Starting point needs some clarification +- Limited description or context +- Some setup complexity +- Task framing somewhat unusual +- May need domain knowledge + +**30-49**: Difficult to reproduce +- Unclear starting point or prerequisites +- Poor or no documentation +- Complex dependencies or setup +- Artificial or contrived feel +- Heavy domain knowledge required + +**0-29**: Not reproducible +- No clear starting state +- No documentation of intent +- Heavy external dependencies +- Cannot be converted to standalone task +- Private or inaccessible resources required + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const commitsPreview = context.commits + .slice(0, 10) + .map((c) => ` - ${c.message.split("\n")[0]}`) + .join("\n"); + + const filesPreview = context.files + .slice(0, 15) + .map((f) => ` - ${f.filename}`) + .join("\n"); + + return `Evaluate this Pull Request for REPRODUCIBILITY as a benchmark task. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} +**Base Branch:** ${context.baseBranch} +**Head Branch:** ${context.headBranch} +**PR Number:** #${context.prNumber} + +## PR Description + +${context.body || "(No description provided)"} + +## Commit History + +${commitsPreview} +${context.commits.length > 10 ? `... and ${context.commits.length - 10} more commits` : ""} + +## Files Modified + +${filesPreview} +${context.files.length > 15 ? `... and ${context.files.length - 15} more files` : ""} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} + +## Key Questions to Answer + +1. **Starting point**: Can we clearly identify where an agent should start (base commit)? +2. **Task description**: Can we derive a natural task prompt from this PR's description and commits? +3. **Dependencies**: Does this require external services, private packages, or special access? +4. **Environment**: Can this be run in a standard development environment? +5. **Clarity**: Would a developer understand what to build from the PR description alone? 
+ +Based on the above, evaluate the REPRODUCIBILITY of converting this PR into a benchmark task.`; +} diff --git a/src/pr-eval/criteria/scope-clarity.ts b/src/pr-eval/criteria/scope-clarity.ts new file mode 100644 index 0000000..2981a7b --- /dev/null +++ b/src/pr-eval/criteria/scope-clarity.ts @@ -0,0 +1,139 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request is suitable as a benchmark task based on SCOPE and CLARITY. + +**YOUR ROLE**: Determine if this PR has clear, self-contained changes that could be reproduced as a coding task for evaluating AI coding agents. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Scope Assessment: +1. **Self-containment** - Are the changes isolated and independent? + - Does NOT depend on external PRs or uncommitted changes + - Changes are localized to related files + - No sprawling changes across unrelated modules + +2. **Appropriate size** - Is the scope meaningful but not overwhelming? + - Too small: Typo fixes, single-line config changes, version bumps + - Too large: 50+ files, multiple unrelated features bundled together + - Ideal: 3-30 files, single cohesive feature or fix + +3. **Focused intent** - Does it solve ONE clear problem? + - Single feature addition + - Single bug fix + - Single refactoring goal + - NOT multiple unrelated changes bundled together + +### Clarity Assessment: +1. **PR description quality** - Is the intent clear? + - Explains WHAT is being changed + - Explains WHY it's needed + - Has clear acceptance criteria (explicit or implicit) + +2. **Commit message quality** - Do commits tell a story? + - Meaningful commit messages + - Logical commit progression + - Not just "fix" or "update" + +3. **Code readability** - Can an AI agent understand the goal? + - Changes are understandable without deep domain knowledge + - Intent is clear from the diff itself + - Not overly complex or cryptic + +--- + +## SCORING RUBRIC + +**90-100**: Excellent benchmark candidate +- Single, clear purpose evident from title and description +- Well-documented PR with context +- 3-20 files changed with focused changes +- Self-contained with no external dependencies +- Clear success criteria derivable from the PR + +**70-89**: Good candidate with minor issues +- Clear purpose but could be better documented +- Slightly too large or too small +- Minor scope creep (1-2 unrelated changes) +- Most context is clear + +**50-69**: Marginal candidate +- Purpose somewhat unclear +- Moderate scope issues (too broad or too narrow) +- Would need significant context to reproduce +- Description is sparse or confusing + +**30-49**: Poor candidate +- Unclear purpose +- Too large (50+ files) or too fragmented +- Hard to understand intent from PR alone +- Multiple unrelated changes bundled + +**0-29**: Not suitable +- No clear purpose or description +- Massive scope or trivially small +- Impossible to derive clear task +- Depends on external context unavailable + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 30) + .map((f) => ` - ${f.filename} (${f.status}, ${f.changes} changes)`) + .join("\n"); + + const commitsPreview = context.commits + .slice(0, 10) + .map((c) => ` - ${c.message.split("\n")[0]}`) + .join("\n"); + + const diffPreview = + context.diff.length > 5000 + ? 
context.diff.slice(0, 5000) + "\n... [diff truncated for evaluation]" + : context.diff; + + return `Evaluate this Pull Request for SCOPE and CLARITY as a benchmark task candidate. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} +**PR Number:** #${context.prNumber} +**Base Branch:** ${context.baseBranch} <- ${context.headBranch} + +## PR Description + +${context.body || "(No description provided)"} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- Total commits: ${context.commits.length} + +## Files Changed + +${filesPreview} +${context.files.length > 30 ? `... and ${context.files.length - 30} more files` : ""} + +## Commit Messages + +${commitsPreview} +${context.commits.length > 10 ? `... and ${context.commits.length - 10} more commits` : ""} + +## Diff Preview + +\`\`\`diff +${diffPreview} +\`\`\` +${context.diffTruncated ? "\n(Note: Diff was truncated due to size)" : ""} + +Based on the above, evaluate the SCOPE and CLARITY of this PR for use as a benchmark task.`; +} diff --git a/src/pr-eval/criteria/technical-quality.ts b/src/pr-eval/criteria/technical-quality.ts new file mode 100644 index 0000000..2666b41 --- /dev/null +++ b/src/pr-eval/criteria/technical-quality.ts @@ -0,0 +1,129 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request demonstrates TECHNICAL QUALITY suitable for a coding benchmark. + +**YOUR ROLE**: Assess if this PR touches meaningful code and has proper testing, making it suitable for evaluating AI coding agents. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Code Meaningfulness: +1. **Substance over style** + - NOT just formatting/linting changes + - NOT just dependency updates or version bumps + - Contains actual logic changes (conditionals, functions, data flow) + +2. **Technical depth** + - Involves decision-making (conditionals, algorithms, data structures) + - Requires understanding of the codebase patterns + - Has technical challenge (not just boilerplate) + +3. **Real-world relevance** + - Represents actual production work + - Not a toy example or demo + - Solves a real problem or adds real functionality + +### Test Infrastructure: +1. **Test coverage** + - Adds or modifies tests alongside code changes + - Tests are meaningful (not just mocks or stubs) + - Tests verify the actual behavioral change + +2. **Test executability** + - Tests can be run independently + - Clear test framework in use (pytest, jest, go test, etc.) + - Tests don't require complex external setup + +3. 
**Verification potential** + - Changes can be verified programmatically + - Success criteria is testable (not subjective) + - Can determine pass/fail objectively + +--- + +## SCORING RUBRIC + +**90-100**: Excellent technical quality +- Significant logic changes with clear technical depth +- Comprehensive test additions or modifications +- Clear verification path through automated tests +- Production-ready, non-trivial implementation + +**70-89**: Good technical quality +- Meaningful code changes with some complexity +- Some test coverage included +- Reasonable verification possible +- Real functionality added/modified + +**50-69**: Moderate technical quality +- Some meaningful changes mixed with trivial ones +- Limited or no test coverage +- Verification might be challenging +- Borderline complexity + +**30-49**: Limited technical quality +- Mostly trivial or cosmetic changes +- Little to no test coverage +- Hard to verify correctness +- Minimal technical challenge + +**0-29**: Poor technical quality +- Only formatting, config, or dependency changes +- No tests whatsoever +- No clear verification method +- No real logic changes + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 30) + .map((f) => ` - ${f.filename} (${f.status}, ${f.changes} changes)`) + .join("\n"); + + const diffPreview = + context.diff.length > 8000 + ? context.diff.slice(0, 8000) + "\n... [diff truncated for evaluation]" + : context.diff; + + return `Evaluate this Pull Request for TECHNICAL QUALITY as a benchmark task candidate. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} + +## PR Description + +${context.body || "(No description provided)"} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- Contains test files: ${context.hasTests ? "Yes" : "No"} + +## Files Changed + +${filesPreview} +${context.files.length > 30 ? `... and ${context.files.length - 30} more files` : ""} + +## Full Diff + +\`\`\`diff +${diffPreview} +\`\`\` +${context.diffTruncated ? "\n(Note: Diff was truncated due to size)" : ""} + +Based on the above, evaluate the TECHNICAL QUALITY of this PR: +1. Does it contain meaningful logic changes (not just formatting/config)? +2. Does it include or modify tests? +3. Can the changes be verified programmatically? +4. 
Is there sufficient technical depth for a benchmark task?`; +} diff --git a/src/pr-eval/fetcher.ts b/src/pr-eval/fetcher.ts new file mode 100644 index 0000000..6530188 --- /dev/null +++ b/src/pr-eval/fetcher.ts @@ -0,0 +1,103 @@ +import { fetchPullRequest, type PullRequestData } from "../util/github.js"; +import { parsePrUrl } from "./parser.js"; + +const MAX_DIFF_LENGTH = 50000; + +export interface PrEvalContext { + url: string; + owner: string; + repo: string; + prNumber: number; + title: string; + body: string; + diff: string; + diffTruncated: boolean; + diffStats: { + filesChanged: number; + additions: number; + deletions: number; + totalLines: number; + }; + commits: Array<{ sha: string; message: string }>; + files: Array<{ + filename: string; + status: string; + changes: number; + }>; + hasTests: boolean; + baseBranch: string; + headBranch: string; +} + +const TEST_FILE_PATTERNS = [ + /test[s]?\//i, + /spec[s]?\//i, + /__tests__\//i, + /\.test\.[jt]sx?$/i, + /\.spec\.[jt]sx?$/i, + /_test\.go$/i, + /_test\.py$/i, + /test_.*\.py$/i, + /\.test\.rs$/i, +]; + +function isTestFile(filename: string): boolean { + return TEST_FILE_PATTERNS.some((pattern) => pattern.test(filename)); +} + +export async function fetchPrContext(prUrl: string): Promise { + const parsed = parsePrUrl(prUrl); + if (!parsed) { + throw new Error(`Invalid GitHub PR URL: ${prUrl}`); + } + + const { owner, repo, prNumber } = parsed; + + let prData: PullRequestData; + try { + prData = await fetchPullRequest(owner, repo, prNumber); + } catch (error) { + if (error instanceof Error && error.message.includes("404")) { + throw new Error( + `PR not found or not accessible: ${owner}/${repo}#${prNumber}. ` + + `Make sure the repository is public and the PR exists.`, + ); + } + throw error; + } + + let diff = prData.diff; + let diffTruncated = false; + if (diff.length > MAX_DIFF_LENGTH) { + diff = diff.slice(0, MAX_DIFF_LENGTH); + diffTruncated = true; + } + + const hasTests = prData.files.some((f) => isTestFile(f.filename)); + + return { + url: prUrl, + owner, + repo, + prNumber, + title: prData.title, + body: prData.body, + diff, + diffTruncated, + diffStats: { + filesChanged: prData.changedFiles, + additions: prData.additions, + deletions: prData.deletions, + totalLines: prData.additions + prData.deletions, + }, + commits: prData.commitMessages, + files: prData.files.map((f) => ({ + filename: f.filename, + status: f.status, + changes: f.changes, + })), + hasTests, + baseBranch: prData.baseBranch, + headBranch: prData.headBranch, + }; +} diff --git a/src/pr-eval/index.ts b/src/pr-eval/index.ts new file mode 100644 index 0000000..50abec3 --- /dev/null +++ b/src/pr-eval/index.ts @@ -0,0 +1,180 @@ +import { z } from "zod"; +import { generateObject } from "ai"; +import { Logger } from "../util/logger.js"; +import { Judge } from "../judges.js"; +import { getZenLanguageModel } from "../zenModels.js"; +import { average, variance, weightedSum } from "../util/math.js"; +import { fetchPrContext, type PrEvalContext } from "./fetcher.js"; +import { PrCriteria } from "./criteria/index.js"; + +export namespace PrEval { + export const DISAGREEMENT_PENALTY = 0.5; + + export interface JudgeScore { + judge: string; + score: number; + rationale: string; + } + + export interface CriterionResult { + criterion: string; + displayName: string; + weight: number; + average: number; + variance: number; + judges: JudgeScore[]; + } + + export type Recommendation = "approved" | "rejected" | "needs-review"; + + export interface EvaluationResult { + prUrl: 
string; + owner: string; + repo: string; + prNumber: number; + finalScore: number; + baseScore: number; + penalty: number; + recommendation: Recommendation; + criteria: CriterionResult[]; + evaluatedAt: string; + } + + function getRecommendation(score: number): Recommendation { + if (score >= 70) return "approved"; + if (score >= 50) return "needs-review"; + return "rejected"; + } + + export function getConsensusLevel(variance: number): "high" | "medium" | "low" { + if (variance < 100) return "high"; + if (variance < 400) return "medium"; + return "low"; + } + + export async function evaluate( + prUrl: string, + opts: { logger: Logger.Instance }, + ): Promise { + opts.logger.log(`Fetching PR data from ${prUrl}...`); + const context = await fetchPrContext(prUrl); + + opts.logger.log( + `PR: ${context.owner}/${context.repo}#${context.prNumber} - "${context.title}"`, + ); + opts.logger.log( + `Stats: ${context.diffStats.filesChanged} files, +${context.diffStats.additions}/-${context.diffStats.deletions} lines`, + ); + + const allScores: CriterionResult[] = []; + + for (const criterionName of PrCriteria.names) { + const config = PrCriteria.all[criterionName]; + const cl = opts.logger.child(`[${config.displayName}]`); + + cl.log("Evaluating..."); + + const scores: JudgeScore[] = []; + for (const judge of Judge.all) { + const jl = cl.child(`[${judge}]`); + jl.log("Judging..."); + + try { + const result = await judgeScore( + config.criterion.systemPrompt, + config.criterion.createUserPrompt(context), + judge, + { logger: jl }, + ); + scores.push({ judge, ...result }); + jl.log(`Score: ${result.score}/100`); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + jl.error(`Failed: ${msg}`); + scores.push({ judge, score: 0, rationale: `Error: ${msg}` }); + } + } + + const avg = average(scores.map((s) => s.score)); + const vrc = variance( + avg, + scores.map((s) => s.score), + ); + + allScores.push({ + criterion: criterionName, + displayName: config.displayName, + weight: config.weight, + average: avg, + variance: vrc, + judges: scores, + }); + + cl.log(`Average: ${avg.toFixed(1)}/100 (variance: ${vrc.toFixed(1)})`); + } + + // Calculate weighted average (scores are 0-100) + const weightedAvg = weightedSum( + allScores.map(({ average, weight }) => ({ value: average, weight })), + ); + + // Calculate weighted variance for penalty + const weightedVrc = weightedSum( + allScores.map(({ variance, weight }) => ({ value: variance, weight })), + ); + + // Apply penalty (scaled for 0-100 range) + const penalty = DISAGREEMENT_PENALTY * Math.sqrt(weightedVrc); + const finalScore = Math.max(0, Math.min(100, weightedAvg - penalty)); + + opts.logger.log(`Final Score: ${finalScore.toFixed(1)}/100`); + opts.logger.log(`Recommendation: ${getRecommendation(finalScore)}`); + + return { + prUrl, + owner: context.owner, + repo: context.repo, + prNumber: context.prNumber, + finalScore, + baseScore: weightedAvg, + penalty, + recommendation: getRecommendation(finalScore), + criteria: allScores, + evaluatedAt: new Date().toISOString(), + }; + } + + async function judgeScore( + systemPrompt: string, + userPrompt: string, + judge: string, + opts: { logger: Logger.Instance }, + ): Promise<{ score: number; rationale: string }> { + const { object } = await generateObject({ + model: getZenLanguageModel(judge), + schema: z.object({ + score: z + .number() + .min(0) + .max(100) + .describe("Score from 0 to 100"), + rationale: z.string().min(1).describe("Explanation of the score"), + }), + system: systemPrompt, + 
temperature: 0, + prompt: userPrompt, + }); + + if (!object || typeof object !== "object") { + throw new Error("Judge must return an object."); + } + if (typeof object.score !== "number" || object.score < 0 || object.score > 100) { + throw new Error("Judge must return a score between 0 and 100."); + } + if (typeof object.rationale !== "string" || object.rationale.length === 0) { + throw new Error("Judge must include a rationale."); + } + + return { score: object.score, rationale: object.rationale }; + } +} diff --git a/src/pr-eval/parser.ts b/src/pr-eval/parser.ts new file mode 100644 index 0000000..40c99b4 --- /dev/null +++ b/src/pr-eval/parser.ts @@ -0,0 +1,31 @@ +export interface ParsedPrUrl { + owner: string; + repo: string; + prNumber: number; +} + +const PR_URL_REGEX = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)\/?$/; + +export function parsePrUrl(url: string): ParsedPrUrl | null { + const trimmed = url.trim(); + const match = trimmed.match(PR_URL_REGEX); + + if (!match) { + return null; + } + + const [, owner, repo, prNumberStr] = match; + const prNumber = parseInt(prNumberStr, 10); + + if (isNaN(prNumber) || prNumber <= 0) { + return null; + } + + return { owner, repo, prNumber }; +} + +export function extractPrUrlFromText(text: string): string | null { + const regex = /https?:\/\/github\.com\/[^/]+\/[^/]+\/pull\/\d+/; + const match = text.match(regex); + return match ? match[0] : null; +} diff --git a/src/pr-eval/reporter.ts b/src/pr-eval/reporter.ts new file mode 100644 index 0000000..6c9e7de --- /dev/null +++ b/src/pr-eval/reporter.ts @@ -0,0 +1,129 @@ +import { PrEval } from "./index.js"; + +export namespace Reporter { + const STATUS_ICONS = { + approved: ":white_check_mark:", + rejected: ":x:", + "needs-review": ":warning:", + } as const; + + const STATUS_LABELS = { + approved: "Approved", + rejected: "Rejected", + "needs-review": "Needs Review", + } as const; + + export function formatComment(result: PrEval.EvaluationResult): string { + const statusIcon = STATUS_ICONS[result.recommendation]; + const statusLabel = STATUS_LABELS[result.recommendation]; + + const criteriaTable = result.criteria + .map((c) => { + const consensus = PrEval.getConsensusLevel(c.variance); + const consensusEmoji = + consensus === "high" ? ":green_circle:" : consensus === "medium" ? ":yellow_circle:" : ":red_circle:"; + return `| ${c.displayName} | ${c.average.toFixed(0)}/100 | ${consensusEmoji} ${capitalize(consensus)} |`; + }) + .join("\n"); + + const criteriaDetails = result.criteria + .map((c) => { + const judgeScores = c.judges + .map((j) => `- **${formatJudgeName(j.judge)}**: ${j.score}/100`) + .join("\n"); + + const combinedRationale = c.judges + .map((j) => `**${formatJudgeName(j.judge)}**: ${j.rationale}`) + .join("\n\n"); + + return `
+${c.displayName} (${c.average.toFixed(0)}/100)
+
+### Judge Scores
+${judgeScores}
+
+### Rationale
+${combinedRationale}
+
+
`; + }) + .join("\n\n"); + + return `## Benchmark Candidate Evaluation + +**PR:** [${result.owner}/${result.repo}#${result.prNumber}](${result.prUrl}) +**Status:** ${statusIcon} **${statusLabel}** +**Final Score:** ${result.finalScore.toFixed(1)}/100 + +--- + +### Criterion Scores + +| Criterion | Score | Consensus | +|-----------|-------|-----------| +${criteriaTable} + +--- + +### Detailed Analysis + +${criteriaDetails} + +--- + +### Scoring Details + +- **Base Score:** ${result.baseScore.toFixed(1)}/100 +- **Disagreement Penalty:** -${result.penalty.toFixed(1)} +- **Final Score:** ${result.finalScore.toFixed(1)}/100 + +${getRecommendationMessage(result.recommendation)} + +--- + +*Evaluated by [opencode-bench](https://github.com/sst/opencode-bench) on ${formatDate(result.evaluatedAt)}* +*Judges: ${result.criteria[0]?.judges.map((j) => formatJudgeName(j.judge)).join(", ")}*`; + } + + export function getLabels(result: PrEval.EvaluationResult): string[] { + const labels = ["benchmark-evaluation"]; + + switch (result.recommendation) { + case "approved": + labels.push("benchmark-approved"); + break; + case "rejected": + labels.push("benchmark-rejected"); + break; + case "needs-review": + labels.push("benchmark-needs-review"); + break; + } + + return labels; + } + + function formatJudgeName(judge: string): string { + return judge.replace("opencode/", "").replace(/-/g, " "); + } + + function capitalize(str: string): string { + return str.charAt(0).toUpperCase() + str.slice(1); + } + + function formatDate(isoDate: string): string { + const date = new Date(isoDate); + return date.toISOString().split("T")[0]; + } + + function getRecommendationMessage(recommendation: PrEval.Recommendation): string { + switch (recommendation) { + case "approved": + return `> :white_check_mark: **This PR appears to be a good candidate for the benchmark.** A maintainer will review and may add it to the evaluation suite.`; + case "needs-review": + return `> :warning: **This PR may be suitable but requires manual review.** A maintainer will evaluate whether it meets benchmark requirements.`; + case "rejected": + return `> :x: **This PR does not appear suitable for the benchmark.** See the detailed analysis above for specific issues. 
You may submit a different PR or provide additional context.`; + } + } +} diff --git a/src/util/github.ts b/src/util/github.ts index 467c8f7..041c906 100644 --- a/src/util/github.ts +++ b/src/util/github.ts @@ -123,3 +123,151 @@ export async function fetchCommits( return results.filter((value): value is CommitDiff => value !== null); } + +export interface PullRequestData { + number: number; + title: string; + body: string; + state: string; + baseBranch: string; + headBranch: string; + additions: number; + deletions: number; + changedFiles: number; + commits: number; + diff: string; + files: Array<{ + filename: string; + status: string; + additions: number; + deletions: number; + changes: number; + }>; + commitMessages: Array<{ sha: string; message: string }>; +} + +export async function fetchPullRequest( + owner: string, + repo: string, + prNumber: number, +): Promise { + const client = getRequestClient(); + + // Fetch PR metadata + const prResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}", { + owner, + repo, + pull_number: prNumber, + }); + + const prData = prResponse.data as { + number: number; + title: string; + body: string | null; + state: string; + base: { ref: string }; + head: { ref: string }; + additions: number; + deletions: number; + changed_files: number; + commits: number; + }; + + // Fetch PR diff + const diffResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}", { + owner, + repo, + pull_number: prNumber, + headers: { + accept: DIFF_ACCEPT_HEADER, + }, + }); + + const diff = String(diffResponse.data); + + // Fetch PR files + const filesResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}/files", { + owner, + repo, + pull_number: prNumber, + per_page: 100, + }); + + const files = (filesResponse.data as Array<{ + filename: string; + status: string; + additions: number; + deletions: number; + changes: number; + }>).map((f) => ({ + filename: f.filename, + status: f.status, + additions: f.additions, + deletions: f.deletions, + changes: f.changes, + })); + + // Fetch PR commits + const commitsResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}/commits", { + owner, + repo, + pull_number: prNumber, + per_page: 100, + }); + + const commitMessages = (commitsResponse.data as Array<{ + sha: string; + commit: { message: string }; + }>).map((c) => ({ + sha: c.sha, + message: c.commit.message, + })); + + return { + number: prData.number, + title: prData.title, + body: prData.body ?? "", + state: prData.state, + baseBranch: prData.base.ref, + headBranch: prData.head.ref, + additions: prData.additions, + deletions: prData.deletions, + changedFiles: prData.changed_files, + commits: prData.commits, + diff, + files, + commitMessages, + }; +} + +export async function addIssueComment( + owner: string, + repo: string, + issueNumber: number, + body: string, +): Promise { + const client = getRequestClient(); + + await client("POST /repos/{owner}/{repo}/issues/{issue_number}/comments", { + owner, + repo, + issue_number: issueNumber, + body, + }); +} + +export async function addIssueLabels( + owner: string, + repo: string, + issueNumber: number, + labels: string[], +): Promise { + const client = getRequestClient(); + + await client("POST /repos/{owner}/{repo}/issues/{issue_number}/labels", { + owner, + repo, + issue_number: issueNumber, + labels, + }); +}
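To make the aggregation in `PrEval.evaluate` concrete, here is a small self-contained sketch of the scoring math: per-criterion judge scores are averaged, combined using the 0.25 weights, and a disagreement penalty of `DISAGREEMENT_PENALTY * sqrt(weighted variance)` is subtracted before clamping to 0-100. The real `average`/`variance`/`weightedSum` helpers live in `src/util/math.js`, which is not part of this diff, so the inlined versions below (and all judge scores) are assumptions for illustration only.

```ts
// Standalone sketch of the aggregation in PrEval.evaluate.
interface Scored { value: number; weight: number }

const average = (xs: number[]) => xs.reduce((a, b) => a + b, 0) / xs.length;
// Population variance around a precomputed mean, matching how variance(avg, scores)
// is called in PrEval.evaluate (the actual helper's definition is not shown here).
const variance = (mean: number, xs: number[]) =>
  xs.reduce((a, x) => a + (x - mean) ** 2, 0) / xs.length;
const weightedSum = (items: Scored[]) =>
  items.reduce((a, { value, weight }) => a + value * weight, 0);

const DISAGREEMENT_PENALTY = 0.5;

// Four criteria, each weighted 0.25, three judges each (scores are made up).
const criteria = [
  { weight: 0.25, judgeScores: [80, 85, 75] }, // scope & clarity
  { weight: 0.25, judgeScores: [70, 90, 50] }, // technical quality (low consensus)
  { weight: 0.25, judgeScores: [65, 60, 70] }, // evaluation feasibility
  { weight: 0.25, judgeScores: [75, 80, 70] }, // reproducibility
].map((c) => {
  const avg = average(c.judgeScores);
  return { ...c, avg, vrc: variance(avg, c.judgeScores) };
});

const baseScore = weightedSum(criteria.map((c) => ({ value: c.avg, weight: c.weight })));
const weightedVrc = weightedSum(criteria.map((c) => ({ value: c.vrc, weight: c.weight })));
const penalty = DISAGREEMENT_PENALTY * Math.sqrt(weightedVrc);
const finalScore = Math.max(0, Math.min(100, baseScore - penalty));

console.log({ baseScore, penalty, finalScore });
// With these numbers: baseScore = 72.5, penalty ≈ 4.4, finalScore ≈ 68.1
```

With these illustrative inputs the spread on the second criterion (variance ≈ 267) is what pulls the result from 72.5 down to roughly 68, i.e. below the 70-point "approved" threshold and into the needs-review band.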