64 changes: 64 additions & 0 deletions .github/workflows/evaluate-pr-candidate.yml
@@ -0,0 +1,64 @@
name: Evaluate PR Candidate

on:
  issues:
    types: [opened]

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Only run if issue body contains a GitHub PR URL
    if: contains(github.event.issue.body, 'github.com') && contains(github.event.issue.body, '/pull/')

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: 1.2.21

      - name: Install dependencies
        run: bun install

      - name: Extract PR URL from issue
        id: extract
        env:
          # Pass the body through an env var rather than interpolating
          # ${{ ... }} directly into the script, which would let a crafted
          # issue body inject shell commands.
          ISSUE_BODY: ${{ github.event.issue.body }}
        run: |
          # Extract first PR URL from issue body
          PR_URL=$(printf '%s' "$ISSUE_BODY" | grep -oE 'https://github\.com/[^/]+/[^/]+/pull/[0-9]+' | head -1)
          echo "pr_url=${PR_URL}" >> "$GITHUB_OUTPUT"
          if [ -z "$PR_URL" ]; then
            echo "No valid PR URL found"
            echo "found=false" >> "$GITHUB_OUTPUT"
          else
            echo "Found PR URL: $PR_URL"
            echo "found=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Evaluate PR candidate
        if: steps.extract.outputs.found == 'true'
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_URL: ${{ steps.extract.outputs.pr_url }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: bun github/evaluate-pr.ts

      - name: Handle missing PR URL
        if: steps.extract.outputs.found == 'false'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh issue comment ${{ github.event.issue.number }} --body "## Invalid Submission

          No valid GitHub PR URL was found in the issue body.

          Please provide a PR URL in the format:
          \`\`\`
          https://github.com/owner/repo/pull/123
          \`\`\`

          Then create a new issue with the correct URL."
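The extract step pulls the first PR link out of the issue body with a grep regex. For reference, an equivalent of that extraction in TypeScript (illustrative only; the workflow itself relies on the shell step above):

```ts
// Illustrative TypeScript equivalent of the workflow's `grep -oE ... | head -1`.
const PR_URL_RE = /https:\/\/github\.com\/[^/]+\/[^/]+\/pull\/[0-9]+/;

export function extractPrUrl(issueBody: string): string | undefined {
  // match() returns the first occurrence, mirroring `head -1`.
  return issueBody.match(PR_URL_RE)?.[0];
}
```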
56 changes: 56 additions & 0 deletions cli.ts
@@ -8,6 +8,8 @@ import { Task } from "~/src/tasks/index.js";
import { Summarizer } from "~/src/summarizer.js";
import { Logger } from "~/src/util/logger.js";
import { Eval } from "./src/eval.js";
import { PrEval } from "./src/pr-eval/index.js";
import { Reporter } from "./src/pr-eval/reporter.js";

const cli = yargs(hideBin(process.argv))
  .scriptName("orvl")
@@ -33,6 +35,60 @@ cli.command(
  },
);

cli.command(
  "evaluate-pr <url>",
  "Evaluate a PR as a benchmark candidate",
  async (yargs) =>
    yargs
      .positional("url", {
        type: "string",
        description: "GitHub PR URL (e.g., https://github.com/owner/repo/pull/123)",
        required: true,
      })
      .option("output", {
        type: "string",
        description: "Output file path for JSON results",
      })
      .example([
        ["orvl evaluate-pr https://github.com/owner/repo/pull/123"],
        ["orvl evaluate-pr https://github.com/owner/repo/pull/123 --output result.json"],
      ]),
  async ({ url, output }) => {
    if (!url) throw new Error("PR URL is required");

    const logger = Logger.create("[pr-eval]");

    const result = await PrEval.evaluate(url, { logger });

    // Print summary
    logger.log("");
    logger.log("=".repeat(60));
    logger.log(`Final Score: ${result.finalScore.toFixed(1)}/100`);
    logger.log(`Recommendation: ${result.recommendation.toUpperCase()}`);
    logger.log("=".repeat(60));
    logger.log("");

    result.criteria.forEach((c) => {
      const consensus = PrEval.getConsensusLevel(c.variance);
      logger.log(`${c.displayName}: ${c.average.toFixed(0)}/100 (${consensus} consensus)`);
      c.judges.forEach((j) => {
        logger.log(` - ${j.judge}: ${j.score}/100`);
      });
    });

    if (output) {
      await writeFile(output, JSON.stringify(result, null, 2));
      logger.log(`\nResults saved to ${output}`);
    }

    // Also print the formatted comment
    logger.log("\n" + "=".repeat(60));
    logger.log("FORMATTED COMMENT PREVIEW:");
    logger.log("=".repeat(60) + "\n");
    logger.log(Reporter.formatComment(result));
  },
);

cli.command(
  "$0 [agent]",
  "Run benchmark",
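`PrEval.getConsensusLevel` is called above but not defined anywhere in this diff. A minimal sketch of what such a helper might look like, assuming consensus is bucketed by the variance of the judges' scores (the thresholds and labels here are illustrative guesses, not taken from the source):

```ts
// Hypothetical sketch: the real PrEval.getConsensusLevel lives in
// src/pr-eval/ and may use different thresholds or labels.
export type ConsensusLevel = "strong" | "moderate" | "weak";

export function getConsensusLevel(variance: number): ConsensusLevel {
  // Low variance means the judges largely agree with one another.
  if (variance < 50) return "strong";
  if (variance < 200) return "moderate";
  return "weak";
}
```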
82 changes: 82 additions & 0 deletions github/evaluate-pr.ts
@@ -0,0 +1,82 @@
#!/usr/bin/env bun
import { Logger } from "../src/util/logger.js";
import { PrEval } from "../src/pr-eval/index.js";
import { Reporter } from "../src/pr-eval/reporter.js";
import { addIssueComment, addIssueLabels } from "../src/util/github.js";

const prUrl = process.env.PR_URL;
const issueNumber = process.env.ISSUE_NUMBER;
const repoOwner = process.env.REPO_OWNER;
const repoName = process.env.REPO_NAME;

if (!prUrl) {
  console.error("PR_URL environment variable is required");
  process.exit(1);
}

if (!issueNumber) {
  console.error("ISSUE_NUMBER environment variable is required");
  process.exit(1);
}

if (!repoOwner || !repoName) {
  console.error("REPO_OWNER and REPO_NAME environment variables are required");
  process.exit(1);
}

const issueNum = parseInt(issueNumber, 10);
if (isNaN(issueNum)) {
  console.error("ISSUE_NUMBER must be a valid number");
  process.exit(1);
}

const logger = Logger.create("[pr-eval]");

async function main() {
  logger.log(`Evaluating PR: ${prUrl}`);
  logger.log(`Will post results to issue #${issueNum}`);

  try {
    const result = await PrEval.evaluate(prUrl, { logger });

    const comment = Reporter.formatComment(result);
    const labels = Reporter.getLabels(result);

    logger.log(`Posting comment to ${repoOwner}/${repoName}#${issueNum}...`);
    await addIssueComment(repoOwner, repoName, issueNum, comment);

    logger.log(`Adding labels: ${labels.join(", ")}...`);
    await addIssueLabels(repoOwner, repoName, issueNum, labels);

    logger.log(`Evaluation complete: ${result.recommendation}`);
    logger.log(`Final score: ${result.finalScore.toFixed(1)}/100`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Evaluation failed: ${message}`);

    const errorComment = `## Evaluation Failed

Unable to evaluate PR candidate:

\`\`\`
${message}
\`\`\`

Please check that:
- The PR URL is valid and the repository is public
- The PR exists and is accessible

If the issue persists, please contact a maintainer.`;

    try {
      await addIssueComment(repoOwner, repoName, issueNum, errorComment);
      await addIssueLabels(repoOwner, repoName, issueNum, ["benchmark-evaluation-failed"]);
    } catch (commentError) {
      logger.error(`Failed to post error comment: ${commentError}`);
    }

    process.exit(1);
  }
}

main();
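`addIssueComment` and `addIssueLabels` come from `src/util/github.js`, which is not part of this diff. A minimal sketch of what those helpers might look like, assuming they wrap the GitHub REST API with the `GITHUB_TOKEN` env var (the real implementation may use Octokit or the `gh` CLI instead):

```ts
// Hypothetical sketch of src/util/github.js; the actual helpers are not shown in this diff.
const GITHUB_API = "https://api.github.com";

async function githubPost(path: string, body: unknown): Promise<void> {
  const res = await fetch(`${GITHUB_API}${path}`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });
  if (!res.ok) throw new Error(`GitHub API ${path} failed with status ${res.status}`);
}

export async function addIssueComment(owner: string, repo: string, issue: number, comment: string): Promise<void> {
  await githubPost(`/repos/${owner}/${repo}/issues/${issue}/comments`, { body: comment });
}

export async function addIssueLabels(owner: string, repo: string, issue: number, labels: string[]): Promise<void> {
  await githubPost(`/repos/${owner}/${repo}/issues/${issue}/labels`, { labels });
}
```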
138 changes: 138 additions & 0 deletions src/pr-eval/criteria/eval-feasibility.ts
@@ -0,0 +1,138 @@
import type { PrEvalContext } from "../fetcher.js";

export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be FEASIBLY EVALUATED as a benchmark task.

**YOUR ROLE**: Determine if an AI agent's attempt to reproduce this PR can be objectively scored.

IMPORTANT: You must give a score from 0-100. Be strict but fair.

---

## WHAT TO EVALUATE

### Diff Size & Complexity:
1. **Appropriate diff size**
- Not too small (< 20 lines) - trivially simple
- Not too large (> 1000 lines) - unmanageable for evaluation
- Ideal: 50-500 lines of meaningful changes

2. **Complexity balance**
- Complex enough to be a meaningful challenge
- Not so complex that evaluation becomes ambiguous
- Should require thought, not just transcription

### Deterministic Verification:
1. **Objective success criteria**
- Can success be measured programmatically?
- Are there clear pass/fail conditions?
- Can we run automated checks (tests, linting, builds)?

2. **Test-based verification**
- Existing tests that must continue passing
- New tests that verify the specific change
- Build/lint checks that must succeed

3. **Diff-based verification**
- Key code patterns identifiable in expected output
- Logic equivalence can be assessed
- Not purely stylistic where any approach works

### Practical Constraints:
1. **Environment requirements**
- No special hardware needed (GPU, specific OS)
- No paid API keys required for testing
- Standard development environment sufficient

2. **Time constraints**
- Can be completed in reasonable time (< 30 min agent runtime)
- No long-running processes required for verification
- Dependencies can be installed quickly

3. **External dependencies**
- No external services needed during evaluation
- Self-contained within the repository
- No authentication to external systems

---

## SCORING RUBRIC

**90-100**: Highly feasible
- Clear verification via automated tests
- Appropriate diff size (100-500 lines)
- Deterministic success criteria
- Standard environment, no special requirements
- Quick setup and verification

**70-89**: Feasible with minor challenges
- Verification possible but requires some interpretation
- Slightly outside ideal size range
- Some ambiguity in success criteria
- Minor setup complexity

**50-69**: Marginally feasible
- Verification would be challenging
- Size at the edges (very small or quite large)
- Success criteria unclear in places
- Some environment complexity

**30-49**: Difficult to evaluate
- Verification very challenging or subjective
- Inappropriate size for benchmark
- Mostly subjective success criteria
- Complex environment or dependencies

**0-29**: Not feasible
- No clear verification method
- Way too large (1000+ lines) or trivially small (< 10 lines)
- Requires external services or paid APIs
- Cannot be evaluated objectively

---

Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`;
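// Illustrative example (an assumption, not defined in this diff) of the JSON
// shape the system prompt asks each judge to return:
//   { "score": 78, "rationale": "Tests verify the change and the 240-line diff is within the ideal range." }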

export function createUserPrompt(context: PrEvalContext): string {
  const filesPreview = context.files
    .slice(0, 20)
    .map((f) => ` - ${f.filename} (${f.changes} changes)`)
    .join("\n");

  return `Evaluate this Pull Request for EVALUATION FEASIBILITY as a benchmark task.

## PR Information

**Title:** ${context.title}
**Repository:** ${context.owner}/${context.repo}

## Statistics

- Files changed: ${context.diffStats.filesChanged}
- Lines added: ${context.diffStats.additions}
- Lines deleted: ${context.diffStats.deletions}
- **Total line changes: ${context.diffStats.totalLines}**
- Contains test files: ${context.hasTests ? "Yes" : "No"}

## Files Changed

${filesPreview}
${context.files.length > 20 ? `... and ${context.files.length - 20} more files` : ""}

## PR Description

${context.body || "(No description provided)"}

## Diff Size Assessment

The diff is ${context.diffStats.totalLines} lines total.
${context.diffTruncated ? "Note: The full diff was truncated due to size (>50K chars), indicating a very large PR." : ""}

## Key Questions to Answer

1. **Size appropriateness**: Is ${context.diffStats.totalLines} lines a reasonable size for a benchmark task?
2. **Verification method**: Can we verify correctness through tests, builds, or diff comparison?
3. **Environment needs**: Does this require any special setup, external services, or paid APIs?
4. **Time feasibility**: Can an AI agent reasonably complete this in under 30 minutes?

Based on the above, evaluate the EVALUATION FEASIBILITY of using this PR as a benchmark task.`;
}
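How these two exports get consumed is outside this diff. As a rough sketch, assuming `PrEval` feeds them to a chat-style model client (`callModel` below is a made-up stand-in, not a real API in this repository):

```ts
// Hypothetical wiring inside src/pr-eval/; the real judge loop is not part of this diff.
import { systemPrompt, createUserPrompt } from "./criteria/eval-feasibility.js";
import type { PrEvalContext } from "./fetcher.js";

// Stand-in for whatever model client the project actually uses.
declare function callModel(system: string, user: string): Promise<string>;

export async function judgeEvalFeasibility(
  context: PrEvalContext,
): Promise<{ score: number; rationale: string }> {
  const raw = await callModel(systemPrompt, createUserPrompt(context));
  // The system prompt instructs the judge to return JSON with 'score' and 'rationale'.
  return JSON.parse(raw) as { score: number; rationale: string };
}
```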