64 changes: 64 additions & 0 deletions .github/workflows/evaluate-pr-candidate.yml
@@ -0,0 +1,64 @@
name: Evaluate PR Candidate

on:
  issues:
    types: [opened]

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Only run if issue body contains a GitHub PR URL
    if: contains(github.event.issue.body, 'github.com') && contains(github.event.issue.body, '/pull/')

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: 1.2.21

      - name: Install dependencies
        run: bun install

      - name: Extract PR URL from issue
        id: extract
        env:
          # Pass the body through an env var rather than interpolating
          # ${{ ... }} directly into the script, which would let a crafted
          # issue body inject shell commands.
          ISSUE_BODY: ${{ github.event.issue.body }}
        run: |
          # Extract first PR URL from issue body
          PR_URL=$(printf '%s' "$ISSUE_BODY" | grep -oE 'https://github\.com/[^/]+/[^/]+/pull/[0-9]+' | head -1)
          echo "pr_url=${PR_URL}" >> "$GITHUB_OUTPUT"
          if [ -z "$PR_URL" ]; then
            echo "No valid PR URL found"
            echo "found=false" >> "$GITHUB_OUTPUT"
          else
            echo "Found PR URL: $PR_URL"
            echo "found=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Evaluate PR candidate
        if: steps.extract.outputs.found == 'true'
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_URL: ${{ steps.extract.outputs.pr_url }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: bun github/evaluate-pr.ts

      - name: Handle missing PR URL
        if: steps.extract.outputs.found == 'false'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh issue comment ${{ github.event.issue.number }} --body "## Invalid Submission

          No valid GitHub PR URL was found in the issue body.

          Please provide a PR URL in the format:
          \`\`\`
          https://github.com/owner/repo/pull/123
          \`\`\`

          Then create a new issue with the correct URL."
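The extract step pulls the first PR link out of the issue body with a grep regex. For reference, an equivalent of that extraction in TypeScript (illustrative only; the workflow itself relies on the shell step above):

```ts
// Illustrative TypeScript equivalent of the workflow's `grep -oE ... | head -1`.
const PR_URL_RE = /https:\/\/github\.com\/[^/]+\/[^/]+\/pull\/[0-9]+/;

export function extractPrUrl(issueBody: string): string | undefined {
  // match() returns the first occurrence, mirroring `head -1`.
  return issueBody.match(PR_URL_RE)?.[0];
}
```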
56 changes: 56 additions & 0 deletions cli.ts
@@ -8,6 +8,8 @@ import { Task } from "~/src/tasks/index.js";
import { Summarizer } from "~/src/summarizer.js";
import { Logger } from "~/src/util/logger.js";
import { Eval } from "./src/eval.js";
import { PrEval } from "./src/pr-eval/index.js";
import { Reporter } from "./src/pr-eval/reporter.js";

const cli = yargs(hideBin(process.argv))
  .scriptName("orvl")
@@ -33,6 +35,60 @@ cli.command(
  },
);

cli.command(
  "evaluate-pr <url>",
  "Evaluate a PR as a benchmark candidate",
  async (yargs) =>
    yargs
      .positional("url", {
        type: "string",
        description: "GitHub PR URL (e.g., https://github.com/owner/repo/pull/123)",
        required: true,
      })
      .option("output", {
        type: "string",
        description: "Output file path for JSON results",
      })
      .example([
        ["orvl evaluate-pr https://github.com/owner/repo/pull/123"],
        ["orvl evaluate-pr https://github.com/owner/repo/pull/123 --output result.json"],
      ]),
  async ({ url, output }) => {
    if (!url) throw new Error("PR URL is required");

    const logger = Logger.create("[pr-eval]");

    const result = await PrEval.evaluate(url, { logger });

    // Print summary
    logger.log("");
    logger.log("=".repeat(60));
    logger.log(`Final Score: ${result.finalScore.toFixed(1)}/100`);
    logger.log(`Recommendation: ${result.recommendation.toUpperCase()}`);
    logger.log("=".repeat(60));
    logger.log("");

    result.criteria.forEach((c) => {
      const consensus = PrEval.getConsensusLevel(c.variance);
      logger.log(`${c.displayName}: ${c.average.toFixed(0)}/100 (${consensus} consensus)`);
      c.judges.forEach((j) => {
        logger.log(` - ${j.judge}: ${j.score}/100`);
      });
    });

    if (output) {
      await writeFile(output, JSON.stringify(result, null, 2));
      logger.log(`\nResults saved to ${output}`);
    }

    // Also print the formatted comment
    logger.log("\n" + "=".repeat(60));
    logger.log("FORMATTED COMMENT PREVIEW:");
    logger.log("=".repeat(60) + "\n");
    logger.log(Reporter.formatComment(result));
  },
);

cli.command(
  "$0 [agent]",
  "Run benchmark",
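`PrEval.getConsensusLevel` is called above but not defined anywhere in this diff. A minimal sketch of what such a helper might look like, assuming consensus is bucketed by the variance of the judges' scores (the thresholds and labels here are illustrative guesses, not taken from the source):

```ts
// Hypothetical sketch: the real PrEval.getConsensusLevel lives in
// src/pr-eval/ and may use different thresholds or labels.
export type ConsensusLevel = "strong" | "moderate" | "weak";

export function getConsensusLevel(variance: number): ConsensusLevel {
  // Low variance means the judges largely agree with one another.
  if (variance < 50) return "strong";
  if (variance < 200) return "moderate";
  return "weak";
}
```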
82 changes: 82 additions & 0 deletions github/evaluate-pr.ts
@@ -0,0 +1,82 @@
#!/usr/bin/env bun
import { Logger } from "../src/util/logger.js";
import { PrEval } from "../src/pr-eval/index.js";
import { Reporter } from "../src/pr-eval/reporter.js";
import { addIssueComment, addIssueLabels } from "../src/util/github.js";

const prUrl = process.env.PR_URL;
const issueNumber = process.env.ISSUE_NUMBER;
const repoOwner = process.env.REPO_OWNER;
const repoName = process.env.REPO_NAME;

if (!prUrl) {
  console.error("PR_URL environment variable is required");
  process.exit(1);
}

if (!issueNumber) {
  console.error("ISSUE_NUMBER environment variable is required");
  process.exit(1);
}

if (!repoOwner || !repoName) {
  console.error("REPO_OWNER and REPO_NAME environment variables are required");
  process.exit(1);
}

const issueNum = parseInt(issueNumber, 10);
if (isNaN(issueNum)) {
  console.error("ISSUE_NUMBER must be a valid number");
  process.exit(1);
}

const logger = Logger.create("[pr-eval]");

async function main() {
  logger.log(`Evaluating PR: ${prUrl}`);
  logger.log(`Will post results to issue #${issueNum}`);

  try {
    const result = await PrEval.evaluate(prUrl, { logger });

    const comment = Reporter.formatComment(result);
    const labels = Reporter.getLabels(result);

    logger.log(`Posting comment to ${repoOwner}/${repoName}#${issueNum}...`);
    await addIssueComment(repoOwner, repoName, issueNum, comment);

    logger.log(`Adding labels: ${labels.join(", ")}...`);
    await addIssueLabels(repoOwner, repoName, issueNum, labels);

    logger.log(`Evaluation complete: ${result.recommendation}`);
    logger.log(`Final score: ${result.finalScore.toFixed(1)}/100`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Evaluation failed: ${message}`);

    const errorComment = `## Evaluation Failed

Unable to evaluate PR candidate:

\`\`\`
${message}
\`\`\`

Please check that:
- The PR URL is valid and the repository is public
- The PR exists and is accessible

If the issue persists, please contact a maintainer.`;

    try {
      await addIssueComment(repoOwner, repoName, issueNum, errorComment);
      await addIssueLabels(repoOwner, repoName, issueNum, ["benchmark-evaluation-failed"]);
    } catch (commentError) {
      logger.error(`Failed to post error comment: ${commentError}`);
    }

    process.exit(1);
  }
}

main();
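`addIssueComment` and `addIssueLabels` come from `src/util/github.js`, which is not part of this diff. A minimal sketch of what those helpers might look like, assuming they wrap the GitHub REST API with the `GITHUB_TOKEN` env var (the real implementation may use Octokit or the `gh` CLI instead):

```ts
// Hypothetical sketch of src/util/github.js; the actual helpers are not shown in this diff.
const GITHUB_API = "https://api.github.com";

async function githubPost(path: string, body: unknown): Promise<void> {
  const res = await fetch(`${GITHUB_API}${path}`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });
  if (!res.ok) throw new Error(`GitHub API ${path} failed with status ${res.status}`);
}

export async function addIssueComment(owner: string, repo: string, issue: number, comment: string): Promise<void> {
  await githubPost(`/repos/${owner}/${repo}/issues/${issue}/comments`, { body: comment });
}

export async function addIssueLabels(owner: string, repo: string, issue: number, labels: string[]): Promise<void> {
  await githubPost(`/repos/${owner}/${repo}/issues/${issue}/labels`, { labels });
}
```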
138 changes: 138 additions & 0 deletions src/pr-eval/criteria/eval-feasibility.ts
@@ -0,0 +1,138 @@
import type { PrEvalContext } from "../fetcher.js";

export const systemPrompt = `You are evaluating whether a GitHub Pull Request can be FEASIBLY EVALUATED as a benchmark task.

**YOUR ROLE**: Determine if an AI agent's attempt to reproduce this PR can be objectively scored.

IMPORTANT: You must give a score from 0-100. Be strict but fair.

---

## WHAT TO EVALUATE

### Diff Size & Complexity:
1. **Appropriate diff size**
- Not too small (< 20 lines) - trivially simple
- Not too large (> 1000 lines) - unmanageable for evaluation
- Ideal: 50-500 lines of meaningful changes

2. **Complexity balance**
- Complex enough to be a meaningful challenge
- Not so complex that evaluation becomes ambiguous
- Should require thought, not just transcription

### Deterministic Verification:
1. **Objective success criteria**
- Can success be measured programmatically?
- Are there clear pass/fail conditions?
- Can we run automated checks (tests, linting, builds)?

2. **Test-based verification**
- Existing tests that must continue passing
- New tests that verify the specific change
- Build/lint checks that must succeed

3. **Diff-based verification**
- Key code patterns identifiable in expected output
- Logic equivalence can be assessed
- Not purely stylistic where any approach works

### Practical Constraints:
1. **Environment requirements**
- No special hardware needed (GPU, specific OS)
- No paid API keys required for testing
- Standard development environment sufficient

2. **Time constraints**
- Can be completed in reasonable time (< 30 min agent runtime)
- No long-running processes required for verification
- Dependencies can be installed quickly

3. **External dependencies**
- No external services needed during evaluation
- Self-contained within the repository
- No authentication to external systems

---

## SCORING RUBRIC

**90-100**: Highly feasible
- Clear verification via automated tests
- Appropriate diff size (100-500 lines)
- Deterministic success criteria
- Standard environment, no special requirements
- Quick setup and verification

**70-89**: Feasible with minor challenges
- Verification possible but requires some interpretation
- Slightly outside ideal size range
- Some ambiguity in success criteria
- Minor setup complexity

**50-69**: Marginally feasible
- Verification would be challenging
- Size at the edges (very small or quite large)
- Success criteria unclear in places
- Some environment complexity

**30-49**: Difficult to evaluate
- Verification very challenging or subjective
- Inappropriate size for benchmark
- Mostly subjective success criteria
- Complex environment or dependencies

**0-29**: Not feasible
- No clear verification method
- Way too large (1000+ lines) or trivially small (< 10 lines)
- Requires external services or paid APIs
- Cannot be evaluated objectively

---

Return JSON with 'score' (0-100) and 'rationale' explaining your assessment.`;
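// Illustrative example (an assumption, not defined in this diff) of the JSON
// shape the system prompt asks each judge to return:
//   { "score": 78, "rationale": "Tests verify the change and the 240-line diff is within the ideal range." }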

export function createUserPrompt(context: PrEvalContext): string {
  const filesPreview = context.files
    .slice(0, 20)
    .map((f) => ` - ${f.filename} (${f.changes} changes)`)
    .join("\n");

  return `Evaluate this Pull Request for EVALUATION FEASIBILITY as a benchmark task.

## PR Information

**Title:** ${context.title}
**Repository:** ${context.owner}/${context.repo}

## Statistics

- Files changed: ${context.diffStats.filesChanged}
- Lines added: ${context.diffStats.additions}
- Lines deleted: ${context.diffStats.deletions}
- **Total line changes: ${context.diffStats.totalLines}**
- Contains test files: ${context.hasTests ? "Yes" : "No"}

## Files Changed

${filesPreview}
${context.files.length > 20 ? `... and ${context.files.length - 20} more files` : ""}

## PR Description

${context.body || "(No description provided)"}

## Diff Size Assessment

The diff is ${context.diffStats.totalLines} lines total.
${context.diffTruncated ? "Note: The full diff was truncated due to size (>50K chars), indicating a very large PR." : ""}

## Key Questions to Answer

1. **Size appropriateness**: Is ${context.diffStats.totalLines} lines a reasonable size for a benchmark task?
2. **Verification method**: Can we verify correctness through tests, builds, or diff comparison?
3. **Environment needs**: Does this require any special setup, external services, or paid APIs?
4. **Time feasibility**: Can an AI agent reasonably complete this in under 30 minutes?

Based on the above, evaluate the EVALUATION FEASIBILITY of using this PR as a benchmark task.`;
}
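How these two exports get consumed is outside this diff. As a rough sketch, assuming `PrEval` feeds them to a chat-style model client (`callModel` below is a made-up stand-in, not a real API in this repository):

```ts
// Hypothetical wiring inside src/pr-eval/; the real judge loop is not part of this diff.
import { systemPrompt, createUserPrompt } from "./criteria/eval-feasibility.js";
import type { PrEvalContext } from "./fetcher.js";

// Stand-in for whatever model client the project actually uses.
declare function callModel(system: string, user: string): Promise<string>;

export async function judgeEvalFeasibility(
  context: PrEvalContext,
): Promise<{ score: number; rationale: string }> {
  const raw = await callModel(systemPrompt, createUserPrompt(context));
  // The system prompt instructs the judge to return JSON with 'score' and 'rationale'.
  return JSON.parse(raw) as { score: number; rationale: string };
}
```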