diff --git a/.github/workflows/evaluate-pr-candidate.yml b/.github/workflows/evaluate-pr-candidate.yml
new file mode 100644
index 0000000..e2ca8da
--- /dev/null
+++ b/.github/workflows/evaluate-pr-candidate.yml
@@ -0,0 +1,68 @@
+name: Evaluate PR Candidate
+
+on:
+  issues:
+    types: [opened]
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    # Only run if issue body contains a GitHub PR URL
+    if: contains(github.event.issue.body, 'github.com') && contains(github.event.issue.body, '/pull/')
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install
+
+      - name: Extract PR URL from issue
+        id: extract
+        env:
+          # Pass the issue body through an env var instead of interpolating it
+          # into the script, so quotes or backticks in user-controlled text cannot inject into the shell.
+          ISSUE_BODY: ${{ github.event.issue.body }}
+        run: |
+          # Extract first PR URL from issue body
+          PR_URL=$(echo "$ISSUE_BODY" | grep -oE 'https://github\.com/[^/]+/[^/]+/pull/[0-9]+' | head -1)
+          echo "pr_url=${PR_URL}" >> $GITHUB_OUTPUT
+          if [ -z "$PR_URL" ]; then
+            echo "No valid PR URL found"
+            echo "found=false" >> $GITHUB_OUTPUT
+          else
+            echo "Found PR URL: $PR_URL"
+            echo "found=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Evaluate PR candidate
+        if: steps.extract.outputs.found == 'true'
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ steps.extract.outputs.pr_url }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          REPO_OWNER: ${{ github.repository_owner }}
+          REPO_NAME: ${{ github.event.repository.name }}
+        run: bun github/evaluate-pr.ts
+
+      - name: Handle missing PR URL
+        if: steps.extract.outputs.found == 'false'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh issue comment ${{ github.event.issue.number }} --body "## Invalid Submission
+
+          No valid GitHub PR URL was found in the issue body.
+
+          Please provide a PR URL in the format:
+          \`\`\`
+          https://github.com/owner/repo/pull/123
+          \`\`\`
+
+          Then create a new issue with the correct URL."
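The extraction step above pulls the first PR link out of the issue body with grep; note that the job-level `if:` only checks for the substrings `github.com` and `/pull/`, so the `found=false` branch still matters. A minimal TypeScript sketch of the same matching behaviour (it mirrors the `extractPrUrlFromText` helper added later in this diff; the sample issue bodies are hypothetical):

```ts
// Hypothetical issue bodies: only the first full PR URL is used.
const bodies = [
  "Please consider https://github.com/owner/repo/pull/123 for the benchmark.",
  "Docs live at github.com/owner/repo under /pull/ guidelines.", // passes the if: filter, but yields no URL
];

// Same shape as the grep pattern in the workflow step above.
const PR_LINK = /https:\/\/github\.com\/[^/]+\/[^/]+\/pull\/[0-9]+/;

for (const body of bodies) {
  const match = body.match(PR_LINK);
  console.log(match ? `found ${match[0]}` : "No valid PR URL found");
}
```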
diff --git a/cli.ts b/cli.ts
index 54731c2..b04f4ce 100644
--- a/cli.ts
+++ b/cli.ts
@@ -8,6 +8,8 @@ import { Task } from "~/src/tasks/index.js";
 import { Summarizer } from "~/src/summarizer.js";
 import { Logger } from "~/src/util/logger.js";
 import { Eval } from "./src/eval.js";
+import { PrEval } from "./src/pr-eval/index.js";
+import { Reporter } from "./src/pr-eval/reporter.js";
 
 const cli = yargs(hideBin(process.argv))
   .scriptName("orvl")
@@ -33,6 +35,60 @@ cli.command(
   },
 );
 
+cli.command(
+  "evaluate-pr <url>",
+  "Evaluate a PR as a benchmark candidate",
+  async (yargs) =>
+    yargs
+      .positional("url", {
+        type: "string",
+        description: "GitHub PR URL (e.g., https://github.com/owner/repo/pull/123)",
+        required: true,
+      })
+      .option("output", {
+        type: "string",
+        description: "Output file path for JSON results",
+      })
+      .example([
+        ["orvl evaluate-pr https://github.com/owner/repo/pull/123"],
+        ["orvl evaluate-pr https://github.com/owner/repo/pull/123 --output result.json"],
+      ]),
+  async ({ url, output }) => {
+    if (!url) throw new Error("PR URL is required");
+
+    const logger = Logger.create("[pr-eval]");
+
+    const result = await PrEval.evaluate(url, { logger });
+
+    // Print summary
+    logger.log("");
+    logger.log("=".repeat(60));
+    logger.log(`Final Score: ${result.finalScore.toFixed(1)}/100`);
+    logger.log(`Recommendation: ${result.recommendation.toUpperCase()}`);
+    logger.log("=".repeat(60));
+    logger.log("");
+
+    result.criteria.forEach((c) => {
+      const consensus = PrEval.getConsensusLevel(c.variance);
+      logger.log(`${c.displayName}: ${c.average.toFixed(0)}/100 (${consensus} consensus)`);
+      c.judges.forEach((j) => {
+        logger.log(`  - ${j.judge}: ${j.score}/100`);
+      });
+    });
+
+    if (output) {
+      await writeFile(output, JSON.stringify(result, null, 2));
+      logger.log(`\nResults saved to ${output}`);
+    }
+
+    // Also print the formatted comment
+    logger.log("\n" + "=".repeat(60));
+    logger.log("FORMATTED COMMENT PREVIEW:");
+    logger.log("=".repeat(60) + "\n");
+    logger.log(Reporter.formatComment(result));
+  },
+);
+
 cli.command(
   "$0 [agent]",
   "Run benchmark",
diff --git a/github/evaluate-pr.ts b/github/evaluate-pr.ts
new file mode 100644
index 0000000..2434bd3
--- /dev/null
+++ b/github/evaluate-pr.ts
@@ -0,0 +1,82 @@
+#!/usr/bin/env bun
+import { Logger } from "../src/util/logger.js";
+import { PrEval } from "../src/pr-eval/index.js";
+import { Reporter } from "../src/pr-eval/reporter.js";
+import { addIssueComment, addIssueLabels } from "../src/util/github.js";
+
+const prUrl = process.env.PR_URL;
+const issueNumber = process.env.ISSUE_NUMBER;
+const repoOwner = process.env.REPO_OWNER;
+const repoName = process.env.REPO_NAME;
+
+if (!prUrl) {
+  console.error("PR_URL environment variable is required");
+  process.exit(1);
+}
+
+if (!issueNumber) {
+  console.error("ISSUE_NUMBER environment variable is required");
+  process.exit(1);
+}
+
+if (!repoOwner || !repoName) {
+  console.error("REPO_OWNER and REPO_NAME environment variables are required");
+  process.exit(1);
+}
+
+const issueNum = parseInt(issueNumber, 10);
+if (isNaN(issueNum)) {
+  console.error("ISSUE_NUMBER must be a valid number");
+  process.exit(1);
+}
+
+const logger = Logger.create("[pr-eval]");
+
+async function main() {
+  logger.log(`Evaluating PR: ${prUrl}`);
+  logger.log(`Will post results to issue #${issueNum}`);
+
+  try {
+    const result = await PrEval.evaluate(prUrl, { logger });
+
+    const comment = Reporter.formatComment(result);
+    const labels = Reporter.getLabels(result);
+
+    logger.log(`Posting comment to 
${repoOwner}/${repoName}#${issueNum}...`); + await addIssueComment(repoOwner, repoName, issueNum, comment); + + logger.log(`Adding labels: ${labels.join(", ")}...`); + await addIssueLabels(repoOwner, repoName, issueNum, labels); + + logger.log(`Evaluation complete: ${result.recommendation}`); + logger.log(`Final score: ${result.finalScore.toFixed(1)}/100`); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + logger.error(`Evaluation failed: ${message}`); + + const errorComment = `## Evaluation Failed + +Unable to evaluate PR candidate: + +\`\`\` +${message} +\`\`\` + +Please check that: +- The PR URL is valid and the repository is public +- The PR exists and is accessible + +If the issue persists, please contact a maintainer.`; + + try { + await addIssueComment(repoOwner, repoName, issueNum, errorComment); + await addIssueLabels(repoOwner, repoName, issueNum, ["benchmark-evaluation-failed"]); + } catch (commentError) { + logger.error(`Failed to post error comment: ${commentError}`); + } + + process.exit(1); + } +} + +main(); diff --git a/src/pr-eval/criteria/eval-feasibility.ts b/src/pr-eval/criteria/eval-feasibility.ts new file mode 100644 index 0000000..a9361d1 --- /dev/null +++ b/src/pr-eval/criteria/eval-feasibility.ts @@ -0,0 +1,138 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be FEASIBLY EVALUATED as a benchmark task. + +**YOUR ROLE**: Determine if an AI agent's attempt to reproduce this PR can be objectively scored. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Diff Size & Complexity: +1. **Appropriate diff size** + - Not too small (< 20 lines) - trivially simple + - Not too large (> 1000 lines) - unmanageable for evaluation + - Ideal: 50-500 lines of meaningful changes + +2. **Complexity balance** + - Complex enough to be a meaningful challenge + - Not so complex that evaluation becomes ambiguous + - Should require thought, not just transcription + +### Deterministic Verification: +1. **Objective success criteria** + - Can success be measured programmatically? + - Are there clear pass/fail conditions? + - Can we run automated checks (tests, linting, builds)? + +2. **Test-based verification** + - Existing tests that must continue passing + - New tests that verify the specific change + - Build/lint checks that must succeed + +3. **Diff-based verification** + - Key code patterns identifiable in expected output + - Logic equivalence can be assessed + - Not purely stylistic where any approach works + +### Practical Constraints: +1. **Environment requirements** + - No special hardware needed (GPU, specific OS) + - No paid API keys required for testing + - Standard development environment sufficient + +2. **Time constraints** + - Can be completed in reasonable time (< 30 min agent runtime) + - No long-running processes required for verification + - Dependencies can be installed quickly + +3. 
**External dependencies** + - No external services needed during evaluation + - Self-contained within the repository + - No authentication to external systems + +--- + +## SCORING RUBRIC + +**90-100**: Highly feasible +- Clear verification via automated tests +- Appropriate diff size (100-500 lines) +- Deterministic success criteria +- Standard environment, no special requirements +- Quick setup and verification + +**70-89**: Feasible with minor challenges +- Verification possible but requires some interpretation +- Slightly outside ideal size range +- Some ambiguity in success criteria +- Minor setup complexity + +**50-69**: Marginally feasible +- Verification would be challenging +- Size at the edges (very small or quite large) +- Success criteria unclear in places +- Some environment complexity + +**30-49**: Difficult to evaluate +- Verification very challenging or subjective +- Inappropriate size for benchmark +- Mostly subjective success criteria +- Complex environment or dependencies + +**0-29**: Not feasible +- No clear verification method +- Way too large (1000+ lines) or trivially small (< 10 lines) +- Requires external services or paid APIs +- Cannot be evaluated objectively + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 20) + .map((f) => ` - ${f.filename} (${f.changes} changes)`) + .join("\n"); + + return `Evaluate this Pull Request for EVALUATION FEASIBILITY as a benchmark task. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- **Total line changes: ${context.diffStats.totalLines}** +- Contains test files: ${context.hasTests ? "Yes" : "No"} + +## Files Changed + +${filesPreview} +${context.files.length > 20 ? `... and ${context.files.length - 20} more files` : ""} + +## PR Description + +${context.body || "(No description provided)"} + +## Diff Size Assessment + +The diff is ${context.diffStats.totalLines} lines total. +${context.diffTruncated ? "Note: The full diff was truncated due to size (>50K chars), indicating a very large PR." : ""} + +## Key Questions to Answer + +1. **Size appropriateness**: Is ${context.diffStats.totalLines} lines a reasonable size for a benchmark task? +2. **Verification method**: Can we verify correctness through tests, builds, or diff comparison? +3. **Environment needs**: Does this require any special setup, external services, or paid APIs? +4. **Time feasibility**: Can an AI agent reasonably complete this in under 30 minutes? 
+ +Based on the above, evaluate the EVALUATION FEASIBILITY of using this PR as a benchmark task.`; +} diff --git a/src/pr-eval/criteria/index.ts b/src/pr-eval/criteria/index.ts new file mode 100644 index 0000000..6085109 --- /dev/null +++ b/src/pr-eval/criteria/index.ts @@ -0,0 +1,43 @@ +import type { PrEvalContext } from "../fetcher.js"; +import * as scopeClarity from "./scope-clarity.js"; +import * as technicalQuality from "./technical-quality.js"; +import * as evalFeasibility from "./eval-feasibility.js"; +import * as reproducibility from "./reproducibility.js"; + +export interface Criterion { + systemPrompt: string; + createUserPrompt: (context: PrEvalContext) => string; +} + +export interface CriterionConfig { + criterion: Criterion; + weight: number; + displayName: string; +} + +export namespace PrCriteria { + export const all: Record = { + "scope-clarity": { + criterion: scopeClarity, + weight: 0.25, + displayName: "Scope & Clarity", + }, + "technical-quality": { + criterion: technicalQuality, + weight: 0.25, + displayName: "Technical Quality", + }, + "eval-feasibility": { + criterion: evalFeasibility, + weight: 0.25, + displayName: "Evaluation Feasibility", + }, + reproducibility: { + criterion: reproducibility, + weight: 0.25, + displayName: "Reproducibility", + }, + }; + + export const names = Object.keys(all) as Array; +} diff --git a/src/pr-eval/criteria/reproducibility.ts b/src/pr-eval/criteria/reproducibility.ts new file mode 100644 index 0000000..6d3c6bd --- /dev/null +++ b/src/pr-eval/criteria/reproducibility.ts @@ -0,0 +1,157 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be REPRODUCED as a benchmark task. + +**YOUR ROLE**: Determine if this PR can be converted into a clear, reproducible coding task that an AI agent could attempt. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Task Derivability: +1. **Clear starting point** + - Base commit is identifiable (the PR base branch) + - Repository state is reproducible + - No hidden dependencies or uncommitted prerequisites + +2. **Prompt generation potential** + - Intent can be described without revealing exact implementation + - Task can be phrased as a natural developer request + - Sufficient context available from PR title, description, and commits + - The "what" is clear even if exact "how" varies + +3. **Expected outcome clarity** + - Target state is well-defined + - Acceptance criteria can be derived from the PR + - Multiple valid implementations could potentially exist + - Success is about behavior, not exact code match + +### External Dependencies: +1. **Repository accessibility** + - Public repository accessible without authentication + - No private dependencies or internal packages + - No proprietary tools required + +2. **Environment reproducibility** + - Standard language/framework versions + - Dependencies are installable via package managers + - No proprietary or licensed software required + +3. **Data requirements** + - No external datasets needed + - No API calls to external services required for the task + - Self-contained within the codebase + +### Real-world Task Characteristics: +1. **Natural task framing** + - Could be a real developer request from a PM or lead + - Not artificially constructed or contrived + - Represents genuine development work + +2. 
**Documentation sufficiency** + - Enough context to understand the goal from PR alone + - Not requiring deep institutional/tribal knowledge + - Reasonable learning curve for understanding the codebase area + +3. **Isolation** + - Changes don't depend on simultaneous other PRs + - Can be applied cleanly to base branch + - No merge conflict complexity + +--- + +## SCORING RUBRIC + +**90-100**: Highly reproducible +- Clear base commit and branch +- Excellent PR description explaining intent +- No external dependencies +- Natural task that could be real developer work +- Standard, accessible environment +- Self-contained changes + +**70-89**: Reproducible with minor effort +- Good starting point identified +- Adequate description, some gaps +- Minimal external requirements +- Reasonable task framing +- Minor setup complexity + +**50-69**: Reproducible with effort +- Starting point needs some clarification +- Limited description or context +- Some setup complexity +- Task framing somewhat unusual +- May need domain knowledge + +**30-49**: Difficult to reproduce +- Unclear starting point or prerequisites +- Poor or no documentation +- Complex dependencies or setup +- Artificial or contrived feel +- Heavy domain knowledge required + +**0-29**: Not reproducible +- No clear starting state +- No documentation of intent +- Heavy external dependencies +- Cannot be converted to standalone task +- Private or inaccessible resources required + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const commitsPreview = context.commits + .slice(0, 10) + .map((c) => ` - ${c.message.split("\n")[0]}`) + .join("\n"); + + const filesPreview = context.files + .slice(0, 15) + .map((f) => ` - ${f.filename}`) + .join("\n"); + + return `Evaluate this Pull Request for REPRODUCIBILITY as a benchmark task. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} +**Base Branch:** ${context.baseBranch} +**Head Branch:** ${context.headBranch} +**PR Number:** #${context.prNumber} + +## PR Description + +${context.body || "(No description provided)"} + +## Commit History + +${commitsPreview} +${context.commits.length > 10 ? `... and ${context.commits.length - 10} more commits` : ""} + +## Files Modified + +${filesPreview} +${context.files.length > 15 ? `... and ${context.files.length - 15} more files` : ""} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} + +## Key Questions to Answer + +1. **Starting point**: Can we clearly identify where an agent should start (base commit)? +2. **Task description**: Can we derive a natural task prompt from this PR's description and commits? +3. **Dependencies**: Does this require external services, private packages, or special access? +4. **Environment**: Can this be run in a standard development environment? +5. **Clarity**: Would a developer understand what to build from the PR description alone? 
+ +Based on the above, evaluate the REPRODUCIBILITY of converting this PR into a benchmark task.`; +} diff --git a/src/pr-eval/criteria/scope-clarity.ts b/src/pr-eval/criteria/scope-clarity.ts new file mode 100644 index 0000000..2981a7b --- /dev/null +++ b/src/pr-eval/criteria/scope-clarity.ts @@ -0,0 +1,139 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request is suitable as a benchmark task based on SCOPE and CLARITY. + +**YOUR ROLE**: Determine if this PR has clear, self-contained changes that could be reproduced as a coding task for evaluating AI coding agents. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Scope Assessment: +1. **Self-containment** - Are the changes isolated and independent? + - Does NOT depend on external PRs or uncommitted changes + - Changes are localized to related files + - No sprawling changes across unrelated modules + +2. **Appropriate size** - Is the scope meaningful but not overwhelming? + - Too small: Typo fixes, single-line config changes, version bumps + - Too large: 50+ files, multiple unrelated features bundled together + - Ideal: 3-30 files, single cohesive feature or fix + +3. **Focused intent** - Does it solve ONE clear problem? + - Single feature addition + - Single bug fix + - Single refactoring goal + - NOT multiple unrelated changes bundled together + +### Clarity Assessment: +1. **PR description quality** - Is the intent clear? + - Explains WHAT is being changed + - Explains WHY it's needed + - Has clear acceptance criteria (explicit or implicit) + +2. **Commit message quality** - Do commits tell a story? + - Meaningful commit messages + - Logical commit progression + - Not just "fix" or "update" + +3. **Code readability** - Can an AI agent understand the goal? + - Changes are understandable without deep domain knowledge + - Intent is clear from the diff itself + - Not overly complex or cryptic + +--- + +## SCORING RUBRIC + +**90-100**: Excellent benchmark candidate +- Single, clear purpose evident from title and description +- Well-documented PR with context +- 3-20 files changed with focused changes +- Self-contained with no external dependencies +- Clear success criteria derivable from the PR + +**70-89**: Good candidate with minor issues +- Clear purpose but could be better documented +- Slightly too large or too small +- Minor scope creep (1-2 unrelated changes) +- Most context is clear + +**50-69**: Marginal candidate +- Purpose somewhat unclear +- Moderate scope issues (too broad or too narrow) +- Would need significant context to reproduce +- Description is sparse or confusing + +**30-49**: Poor candidate +- Unclear purpose +- Too large (50+ files) or too fragmented +- Hard to understand intent from PR alone +- Multiple unrelated changes bundled + +**0-29**: Not suitable +- No clear purpose or description +- Massive scope or trivially small +- Impossible to derive clear task +- Depends on external context unavailable + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 30) + .map((f) => ` - ${f.filename} (${f.status}, ${f.changes} changes)`) + .join("\n"); + + const commitsPreview = context.commits + .slice(0, 10) + .map((c) => ` - ${c.message.split("\n")[0]}`) + .join("\n"); + + const diffPreview = + context.diff.length > 5000 + ? 
context.diff.slice(0, 5000) + "\n... [diff truncated for evaluation]" + : context.diff; + + return `Evaluate this Pull Request for SCOPE and CLARITY as a benchmark task candidate. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} +**PR Number:** #${context.prNumber} +**Base Branch:** ${context.baseBranch} <- ${context.headBranch} + +## PR Description + +${context.body || "(No description provided)"} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- Total commits: ${context.commits.length} + +## Files Changed + +${filesPreview} +${context.files.length > 30 ? `... and ${context.files.length - 30} more files` : ""} + +## Commit Messages + +${commitsPreview} +${context.commits.length > 10 ? `... and ${context.commits.length - 10} more commits` : ""} + +## Diff Preview + +\`\`\`diff +${diffPreview} +\`\`\` +${context.diffTruncated ? "\n(Note: Diff was truncated due to size)" : ""} + +Based on the above, evaluate the SCOPE and CLARITY of this PR for use as a benchmark task.`; +} diff --git a/src/pr-eval/criteria/technical-quality.ts b/src/pr-eval/criteria/technical-quality.ts new file mode 100644 index 0000000..2666b41 --- /dev/null +++ b/src/pr-eval/criteria/technical-quality.ts @@ -0,0 +1,129 @@ +import type { PrEvalContext } from "../fetcher.js"; + +export const systemPrompt = `You are evaluating whether a GitHub Pull Request demonstrates TECHNICAL QUALITY suitable for a coding benchmark. + +**YOUR ROLE**: Assess if this PR touches meaningful code and has proper testing, making it suitable for evaluating AI coding agents. + +IMPORTANT: You must give a score from 0-100. Be strict but fair. + +--- + +## WHAT TO EVALUATE + +### Code Meaningfulness: +1. **Substance over style** + - NOT just formatting/linting changes + - NOT just dependency updates or version bumps + - Contains actual logic changes (conditionals, functions, data flow) + +2. **Technical depth** + - Involves decision-making (conditionals, algorithms, data structures) + - Requires understanding of the codebase patterns + - Has technical challenge (not just boilerplate) + +3. **Real-world relevance** + - Represents actual production work + - Not a toy example or demo + - Solves a real problem or adds real functionality + +### Test Infrastructure: +1. **Test coverage** + - Adds or modifies tests alongside code changes + - Tests are meaningful (not just mocks or stubs) + - Tests verify the actual behavioral change + +2. **Test executability** + - Tests can be run independently + - Clear test framework in use (pytest, jest, go test, etc.) + - Tests don't require complex external setup + +3. 
**Verification potential** + - Changes can be verified programmatically + - Success criteria is testable (not subjective) + - Can determine pass/fail objectively + +--- + +## SCORING RUBRIC + +**90-100**: Excellent technical quality +- Significant logic changes with clear technical depth +- Comprehensive test additions or modifications +- Clear verification path through automated tests +- Production-ready, non-trivial implementation + +**70-89**: Good technical quality +- Meaningful code changes with some complexity +- Some test coverage included +- Reasonable verification possible +- Real functionality added/modified + +**50-69**: Moderate technical quality +- Some meaningful changes mixed with trivial ones +- Limited or no test coverage +- Verification might be challenging +- Borderline complexity + +**30-49**: Limited technical quality +- Mostly trivial or cosmetic changes +- Little to no test coverage +- Hard to verify correctness +- Minimal technical challenge + +**0-29**: Poor technical quality +- Only formatting, config, or dependency changes +- No tests whatsoever +- No clear verification method +- No real logic changes + +--- + +Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`; + +export function createUserPrompt(context: PrEvalContext): string { + const filesPreview = context.files + .slice(0, 30) + .map((f) => ` - ${f.filename} (${f.status}, ${f.changes} changes)`) + .join("\n"); + + const diffPreview = + context.diff.length > 8000 + ? context.diff.slice(0, 8000) + "\n... [diff truncated for evaluation]" + : context.diff; + + return `Evaluate this Pull Request for TECHNICAL QUALITY as a benchmark task candidate. + +## PR Information + +**Title:** ${context.title} +**Repository:** ${context.owner}/${context.repo} + +## PR Description + +${context.body || "(No description provided)"} + +## Statistics + +- Files changed: ${context.diffStats.filesChanged} +- Lines added: ${context.diffStats.additions} +- Lines deleted: ${context.diffStats.deletions} +- Contains test files: ${context.hasTests ? "Yes" : "No"} + +## Files Changed + +${filesPreview} +${context.files.length > 30 ? `... and ${context.files.length - 30} more files` : ""} + +## Full Diff + +\`\`\`diff +${diffPreview} +\`\`\` +${context.diffTruncated ? "\n(Note: Diff was truncated due to size)" : ""} + +Based on the above, evaluate the TECHNICAL QUALITY of this PR: +1. Does it contain meaningful logic changes (not just formatting/config)? +2. Does it include or modify tests? +3. Can the changes be verified programmatically? +4. 
Is there sufficient technical depth for a benchmark task?`; +} diff --git a/src/pr-eval/fetcher.ts b/src/pr-eval/fetcher.ts new file mode 100644 index 0000000..6530188 --- /dev/null +++ b/src/pr-eval/fetcher.ts @@ -0,0 +1,103 @@ +import { fetchPullRequest, type PullRequestData } from "../util/github.js"; +import { parsePrUrl } from "./parser.js"; + +const MAX_DIFF_LENGTH = 50000; + +export interface PrEvalContext { + url: string; + owner: string; + repo: string; + prNumber: number; + title: string; + body: string; + diff: string; + diffTruncated: boolean; + diffStats: { + filesChanged: number; + additions: number; + deletions: number; + totalLines: number; + }; + commits: Array<{ sha: string; message: string }>; + files: Array<{ + filename: string; + status: string; + changes: number; + }>; + hasTests: boolean; + baseBranch: string; + headBranch: string; +} + +const TEST_FILE_PATTERNS = [ + /test[s]?\//i, + /spec[s]?\//i, + /__tests__\//i, + /\.test\.[jt]sx?$/i, + /\.spec\.[jt]sx?$/i, + /_test\.go$/i, + /_test\.py$/i, + /test_.*\.py$/i, + /\.test\.rs$/i, +]; + +function isTestFile(filename: string): boolean { + return TEST_FILE_PATTERNS.some((pattern) => pattern.test(filename)); +} + +export async function fetchPrContext(prUrl: string): Promise { + const parsed = parsePrUrl(prUrl); + if (!parsed) { + throw new Error(`Invalid GitHub PR URL: ${prUrl}`); + } + + const { owner, repo, prNumber } = parsed; + + let prData: PullRequestData; + try { + prData = await fetchPullRequest(owner, repo, prNumber); + } catch (error) { + if (error instanceof Error && error.message.includes("404")) { + throw new Error( + `PR not found or not accessible: ${owner}/${repo}#${prNumber}. ` + + `Make sure the repository is public and the PR exists.`, + ); + } + throw error; + } + + let diff = prData.diff; + let diffTruncated = false; + if (diff.length > MAX_DIFF_LENGTH) { + diff = diff.slice(0, MAX_DIFF_LENGTH); + diffTruncated = true; + } + + const hasTests = prData.files.some((f) => isTestFile(f.filename)); + + return { + url: prUrl, + owner, + repo, + prNumber, + title: prData.title, + body: prData.body, + diff, + diffTruncated, + diffStats: { + filesChanged: prData.changedFiles, + additions: prData.additions, + deletions: prData.deletions, + totalLines: prData.additions + prData.deletions, + }, + commits: prData.commitMessages, + files: prData.files.map((f) => ({ + filename: f.filename, + status: f.status, + changes: f.changes, + })), + hasTests, + baseBranch: prData.baseBranch, + headBranch: prData.headBranch, + }; +} diff --git a/src/pr-eval/index.ts b/src/pr-eval/index.ts new file mode 100644 index 0000000..50abec3 --- /dev/null +++ b/src/pr-eval/index.ts @@ -0,0 +1,180 @@ +import { z } from "zod"; +import { generateObject } from "ai"; +import { Logger } from "../util/logger.js"; +import { Judge } from "../judges.js"; +import { getZenLanguageModel } from "../zenModels.js"; +import { average, variance, weightedSum } from "../util/math.js"; +import { fetchPrContext, type PrEvalContext } from "./fetcher.js"; +import { PrCriteria } from "./criteria/index.js"; + +export namespace PrEval { + export const DISAGREEMENT_PENALTY = 0.5; + + export interface JudgeScore { + judge: string; + score: number; + rationale: string; + } + + export interface CriterionResult { + criterion: string; + displayName: string; + weight: number; + average: number; + variance: number; + judges: JudgeScore[]; + } + + export type Recommendation = "approved" | "rejected" | "needs-review"; + + export interface EvaluationResult { + prUrl: 
string; + owner: string; + repo: string; + prNumber: number; + finalScore: number; + baseScore: number; + penalty: number; + recommendation: Recommendation; + criteria: CriterionResult[]; + evaluatedAt: string; + } + + function getRecommendation(score: number): Recommendation { + if (score >= 70) return "approved"; + if (score >= 50) return "needs-review"; + return "rejected"; + } + + export function getConsensusLevel(variance: number): "high" | "medium" | "low" { + if (variance < 100) return "high"; + if (variance < 400) return "medium"; + return "low"; + } + + export async function evaluate( + prUrl: string, + opts: { logger: Logger.Instance }, + ): Promise { + opts.logger.log(`Fetching PR data from ${prUrl}...`); + const context = await fetchPrContext(prUrl); + + opts.logger.log( + `PR: ${context.owner}/${context.repo}#${context.prNumber} - "${context.title}"`, + ); + opts.logger.log( + `Stats: ${context.diffStats.filesChanged} files, +${context.diffStats.additions}/-${context.diffStats.deletions} lines`, + ); + + const allScores: CriterionResult[] = []; + + for (const criterionName of PrCriteria.names) { + const config = PrCriteria.all[criterionName]; + const cl = opts.logger.child(`[${config.displayName}]`); + + cl.log("Evaluating..."); + + const scores: JudgeScore[] = []; + for (const judge of Judge.all) { + const jl = cl.child(`[${judge}]`); + jl.log("Judging..."); + + try { + const result = await judgeScore( + config.criterion.systemPrompt, + config.criterion.createUserPrompt(context), + judge, + { logger: jl }, + ); + scores.push({ judge, ...result }); + jl.log(`Score: ${result.score}/100`); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + jl.error(`Failed: ${msg}`); + scores.push({ judge, score: 0, rationale: `Error: ${msg}` }); + } + } + + const avg = average(scores.map((s) => s.score)); + const vrc = variance( + avg, + scores.map((s) => s.score), + ); + + allScores.push({ + criterion: criterionName, + displayName: config.displayName, + weight: config.weight, + average: avg, + variance: vrc, + judges: scores, + }); + + cl.log(`Average: ${avg.toFixed(1)}/100 (variance: ${vrc.toFixed(1)})`); + } + + // Calculate weighted average (scores are 0-100) + const weightedAvg = weightedSum( + allScores.map(({ average, weight }) => ({ value: average, weight })), + ); + + // Calculate weighted variance for penalty + const weightedVrc = weightedSum( + allScores.map(({ variance, weight }) => ({ value: variance, weight })), + ); + + // Apply penalty (scaled for 0-100 range) + const penalty = DISAGREEMENT_PENALTY * Math.sqrt(weightedVrc); + const finalScore = Math.max(0, Math.min(100, weightedAvg - penalty)); + + opts.logger.log(`Final Score: ${finalScore.toFixed(1)}/100`); + opts.logger.log(`Recommendation: ${getRecommendation(finalScore)}`); + + return { + prUrl, + owner: context.owner, + repo: context.repo, + prNumber: context.prNumber, + finalScore, + baseScore: weightedAvg, + penalty, + recommendation: getRecommendation(finalScore), + criteria: allScores, + evaluatedAt: new Date().toISOString(), + }; + } + + async function judgeScore( + systemPrompt: string, + userPrompt: string, + judge: string, + opts: { logger: Logger.Instance }, + ): Promise<{ score: number; rationale: string }> { + const { object } = await generateObject({ + model: getZenLanguageModel(judge), + schema: z.object({ + score: z + .number() + .min(0) + .max(100) + .describe("Score from 0 to 100"), + rationale: z.string().min(1).describe("Explanation of the score"), + }), + system: systemPrompt, + 
temperature: 0, + prompt: userPrompt, + }); + + if (!object || typeof object !== "object") { + throw new Error("Judge must return an object."); + } + if (typeof object.score !== "number" || object.score < 0 || object.score > 100) { + throw new Error("Judge must return a score between 0 and 100."); + } + if (typeof object.rationale !== "string" || object.rationale.length === 0) { + throw new Error("Judge must include a rationale."); + } + + return { score: object.score, rationale: object.rationale }; + } +} diff --git a/src/pr-eval/parser.ts b/src/pr-eval/parser.ts new file mode 100644 index 0000000..40c99b4 --- /dev/null +++ b/src/pr-eval/parser.ts @@ -0,0 +1,31 @@ +export interface ParsedPrUrl { + owner: string; + repo: string; + prNumber: number; +} + +const PR_URL_REGEX = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)\/?$/; + +export function parsePrUrl(url: string): ParsedPrUrl | null { + const trimmed = url.trim(); + const match = trimmed.match(PR_URL_REGEX); + + if (!match) { + return null; + } + + const [, owner, repo, prNumberStr] = match; + const prNumber = parseInt(prNumberStr, 10); + + if (isNaN(prNumber) || prNumber <= 0) { + return null; + } + + return { owner, repo, prNumber }; +} + +export function extractPrUrlFromText(text: string): string | null { + const regex = /https?:\/\/github\.com\/[^/]+\/[^/]+\/pull\/\d+/; + const match = text.match(regex); + return match ? match[0] : null; +} diff --git a/src/pr-eval/reporter.ts b/src/pr-eval/reporter.ts new file mode 100644 index 0000000..6c9e7de --- /dev/null +++ b/src/pr-eval/reporter.ts @@ -0,0 +1,129 @@ +import { PrEval } from "./index.js"; + +export namespace Reporter { + const STATUS_ICONS = { + approved: ":white_check_mark:", + rejected: ":x:", + "needs-review": ":warning:", + } as const; + + const STATUS_LABELS = { + approved: "Approved", + rejected: "Rejected", + "needs-review": "Needs Review", + } as const; + + export function formatComment(result: PrEval.EvaluationResult): string { + const statusIcon = STATUS_ICONS[result.recommendation]; + const statusLabel = STATUS_LABELS[result.recommendation]; + + const criteriaTable = result.criteria + .map((c) => { + const consensus = PrEval.getConsensusLevel(c.variance); + const consensusEmoji = + consensus === "high" ? ":green_circle:" : consensus === "medium" ? ":yellow_circle:" : ":red_circle:"; + return `| ${c.displayName} | ${c.average.toFixed(0)}/100 | ${consensusEmoji} ${capitalize(consensus)} |`; + }) + .join("\n"); + + const criteriaDetails = result.criteria + .map((c) => { + const judgeScores = c.judges + .map((j) => `- **${formatJudgeName(j.judge)}**: ${j.score}/100`) + .join("\n"); + + const combinedRationale = c.judges + .map((j) => `**${formatJudgeName(j.judge)}**: ${j.rationale}`) + .join("\n\n"); + + return `
+${c.displayName} (${c.average.toFixed(0)}/100)
+
+### Judge Scores
+${judgeScores}
+
+### Rationale
+${combinedRationale}
+
+
`; + }) + .join("\n\n"); + + return `## Benchmark Candidate Evaluation + +**PR:** [${result.owner}/${result.repo}#${result.prNumber}](${result.prUrl}) +**Status:** ${statusIcon} **${statusLabel}** +**Final Score:** ${result.finalScore.toFixed(1)}/100 + +--- + +### Criterion Scores + +| Criterion | Score | Consensus | +|-----------|-------|-----------| +${criteriaTable} + +--- + +### Detailed Analysis + +${criteriaDetails} + +--- + +### Scoring Details + +- **Base Score:** ${result.baseScore.toFixed(1)}/100 +- **Disagreement Penalty:** -${result.penalty.toFixed(1)} +- **Final Score:** ${result.finalScore.toFixed(1)}/100 + +${getRecommendationMessage(result.recommendation)} + +--- + +*Evaluated by [opencode-bench](https://github.com/sst/opencode-bench) on ${formatDate(result.evaluatedAt)}* +*Judges: ${result.criteria[0]?.judges.map((j) => formatJudgeName(j.judge)).join(", ")}*`; + } + + export function getLabels(result: PrEval.EvaluationResult): string[] { + const labels = ["benchmark-evaluation"]; + + switch (result.recommendation) { + case "approved": + labels.push("benchmark-approved"); + break; + case "rejected": + labels.push("benchmark-rejected"); + break; + case "needs-review": + labels.push("benchmark-needs-review"); + break; + } + + return labels; + } + + function formatJudgeName(judge: string): string { + return judge.replace("opencode/", "").replace(/-/g, " "); + } + + function capitalize(str: string): string { + return str.charAt(0).toUpperCase() + str.slice(1); + } + + function formatDate(isoDate: string): string { + const date = new Date(isoDate); + return date.toISOString().split("T")[0]; + } + + function getRecommendationMessage(recommendation: PrEval.Recommendation): string { + switch (recommendation) { + case "approved": + return `> :white_check_mark: **This PR appears to be a good candidate for the benchmark.** A maintainer will review and may add it to the evaluation suite.`; + case "needs-review": + return `> :warning: **This PR may be suitable but requires manual review.** A maintainer will evaluate whether it meets benchmark requirements.`; + case "rejected": + return `> :x: **This PR does not appear suitable for the benchmark.** See the detailed analysis above for specific issues. 
You may submit a different PR or provide additional context.`; + } + } +} diff --git a/src/util/github.ts b/src/util/github.ts index 467c8f7..041c906 100644 --- a/src/util/github.ts +++ b/src/util/github.ts @@ -123,3 +123,151 @@ export async function fetchCommits( return results.filter((value): value is CommitDiff => value !== null); } + +export interface PullRequestData { + number: number; + title: string; + body: string; + state: string; + baseBranch: string; + headBranch: string; + additions: number; + deletions: number; + changedFiles: number; + commits: number; + diff: string; + files: Array<{ + filename: string; + status: string; + additions: number; + deletions: number; + changes: number; + }>; + commitMessages: Array<{ sha: string; message: string }>; +} + +export async function fetchPullRequest( + owner: string, + repo: string, + prNumber: number, +): Promise { + const client = getRequestClient(); + + // Fetch PR metadata + const prResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}", { + owner, + repo, + pull_number: prNumber, + }); + + const prData = prResponse.data as { + number: number; + title: string; + body: string | null; + state: string; + base: { ref: string }; + head: { ref: string }; + additions: number; + deletions: number; + changed_files: number; + commits: number; + }; + + // Fetch PR diff + const diffResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}", { + owner, + repo, + pull_number: prNumber, + headers: { + accept: DIFF_ACCEPT_HEADER, + }, + }); + + const diff = String(diffResponse.data); + + // Fetch PR files + const filesResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}/files", { + owner, + repo, + pull_number: prNumber, + per_page: 100, + }); + + const files = (filesResponse.data as Array<{ + filename: string; + status: string; + additions: number; + deletions: number; + changes: number; + }>).map((f) => ({ + filename: f.filename, + status: f.status, + additions: f.additions, + deletions: f.deletions, + changes: f.changes, + })); + + // Fetch PR commits + const commitsResponse = await client("GET /repos/{owner}/{repo}/pulls/{pull_number}/commits", { + owner, + repo, + pull_number: prNumber, + per_page: 100, + }); + + const commitMessages = (commitsResponse.data as Array<{ + sha: string; + commit: { message: string }; + }>).map((c) => ({ + sha: c.sha, + message: c.commit.message, + })); + + return { + number: prData.number, + title: prData.title, + body: prData.body ?? "", + state: prData.state, + baseBranch: prData.base.ref, + headBranch: prData.head.ref, + additions: prData.additions, + deletions: prData.deletions, + changedFiles: prData.changed_files, + commits: prData.commits, + diff, + files, + commitMessages, + }; +} + +export async function addIssueComment( + owner: string, + repo: string, + issueNumber: number, + body: string, +): Promise { + const client = getRequestClient(); + + await client("POST /repos/{owner}/{repo}/issues/{issue_number}/comments", { + owner, + repo, + issue_number: issueNumber, + body, + }); +} + +export async function addIssueLabels( + owner: string, + repo: string, + issueNumber: number, + labels: string[], +): Promise { + const client = getRequestClient(); + + await client("POST /repos/{owner}/{repo}/issues/{issue_number}/labels", { + owner, + repo, + issue_number: issueNumber, + labels, + }); +}
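To make the aggregation in `PrEval.evaluate` concrete, here is a small self-contained sketch of the scoring math: per-criterion judge scores are averaged, combined using the 0.25 weights, and a disagreement penalty of `DISAGREEMENT_PENALTY * sqrt(weighted variance)` is subtracted before clamping to 0-100. The real `average`/`variance`/`weightedSum` helpers live in `src/util/math.js`, which is not part of this diff, so the inlined versions below (and all judge scores) are assumptions for illustration only.

```ts
// Standalone sketch of the aggregation in PrEval.evaluate.
interface Scored { value: number; weight: number }

const average = (xs: number[]) => xs.reduce((a, b) => a + b, 0) / xs.length;
// Population variance around a precomputed mean, matching how variance(avg, scores)
// is called in PrEval.evaluate (the actual helper's definition is not shown here).
const variance = (mean: number, xs: number[]) =>
  xs.reduce((a, x) => a + (x - mean) ** 2, 0) / xs.length;
const weightedSum = (items: Scored[]) =>
  items.reduce((a, { value, weight }) => a + value * weight, 0);

const DISAGREEMENT_PENALTY = 0.5;

// Four criteria, each weighted 0.25, three judges each (scores are made up).
const criteria = [
  { weight: 0.25, judgeScores: [80, 85, 75] }, // scope & clarity
  { weight: 0.25, judgeScores: [70, 90, 50] }, // technical quality (low consensus)
  { weight: 0.25, judgeScores: [65, 60, 70] }, // evaluation feasibility
  { weight: 0.25, judgeScores: [75, 80, 70] }, // reproducibility
].map((c) => {
  const avg = average(c.judgeScores);
  return { ...c, avg, vrc: variance(avg, c.judgeScores) };
});

const baseScore = weightedSum(criteria.map((c) => ({ value: c.avg, weight: c.weight })));
const weightedVrc = weightedSum(criteria.map((c) => ({ value: c.vrc, weight: c.weight })));
const penalty = DISAGREEMENT_PENALTY * Math.sqrt(weightedVrc);
const finalScore = Math.max(0, Math.min(100, baseScore - penalty));

console.log({ baseScore, penalty, finalScore });
// With these numbers: baseScore = 72.5, penalty ≈ 4.4, finalScore ≈ 68.1
```

With these illustrative inputs the spread on the second criterion (variance ≈ 267) is what pulls the result from 72.5 down to roughly 68, i.e. below the 70-point "approved" threshold and into the needs-review band.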