-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate-balanced-sample.ts
More file actions
90 lines (74 loc) · 2.97 KB
/
generate-balanced-sample.ts
File metadata and controls
90 lines (74 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import fs from 'fs';
// Configuration
/** Upper bound on the number of bugs written to the output sample. */
const SAMPLE_SIZE = 50;
/** Path of the JSON file that holds the full bug list. */
const INPUT_FILE = 'bugs.json';
/** Only bugs whose `difficultyLevel` equals this value are eligible for the sample. */
const DIFFICULTY_LEVEL = 'easy';
/**
 * Path the generated sample is written to. It is derived from DIFFICULTY_LEVEL
 * so that switching the difficulty cannot silently overwrite the sample of a
 * different difficulty; with the current setting it is 'comments-easy-bugs.json'.
 */
const OUTPUT_FILE = `comments-${DIFFICULTY_LEVEL}-bugs.json`;
/**
 * Shape of one entry in the input bug list.
 *
 * Only the fields this script reads are declared explicitly; any other fields
 * are allowed by the index signature and end up in the output unchanged,
 * because selected bugs are serialized as whole objects.
 */
interface Bug {
  /** Repository the bug belongs to; used as the grouping key when balancing the sample. */
  repo: string;
  /** Must be exactly `true` for the bug to be eligible. */
  LLMReviewPassed: boolean;
  /** Must be exactly `true` for the bug to be eligible. */
  reReviewedAfterJan27th: boolean;
  /** Compared against DIFFICULTY_LEVEL; a bug without this field is never eligible. */
  difficultyLevel?: string;
  [key: string]: any; // for other properties we don't care about in this script
}
// Read and parse the bugs file.
// JSON.parse returns untyped data, so check the top-level shape before treating
// it as a bug list; otherwise a malformed file only surfaces later as an
// obscure "filter is not a function" error.
const parsedInput: unknown = JSON.parse(fs.readFileSync(INPUT_FILE, 'utf-8'));
if (!Array.isArray(parsedInput)) {
  throw new Error(`Expected ${INPUT_FILE} to contain a JSON array of bugs`);
}
// NOTE: individual entries are not validated; they are assumed to match Bug.
const bugs: Bug[] = parsedInput;

// Keep only bugs that passed LLM review, were re-reviewed after Jan 27th, and
// have the configured difficulty. The strict `=== true` comparisons reject
// truthy non-boolean values such as the string "true".
const eligibleBugs = bugs.filter(
  (bug) =>
    bug.LLMReviewPassed === true &&
    bug.reReviewedAfterJan27th === true &&
    bug.difficultyLevel === DIFFICULTY_LEVEL,
);
console.log(`Found ${eligibleBugs.length} eligible ${DIFFICULTY_LEVEL} bugs out of ${bugs.length} total bugs`);
// Group bugs by repository, preserving input order inside each group.
// The lookup table is created with a null prototype so that a repo name which
// collides with an Object.prototype member (e.g. "constructor", "toString")
// gets its own bucket instead of resolving to the inherited member, which
// would make the "bucket exists" check pass and then fail on `.push`.
// Each bucket has the same array type as eligibleBugs.
const bugsByRepo: Record<string, typeof eligibleBugs> = Object.create(null);
for (const bug of eligibleBugs) {
  const bucket = bugsByRepo[bug.repo];
  if (bucket === undefined) {
    bugsByRepo[bug.repo] = [bug];
  } else {
    bucket.push(bug);
  }
}
// Get list of repos with their eligible-bug counts, largest repo first.
const repoStats = Object.entries(bugsByRepo)
  .map(([repo, repoBugs]) => ({ repo, count: repoBugs.length }))
  .sort((a, b) => b.count - a.count);

// Build the heading from DIFFICULTY_LEVEL (first letter upper-cased) so it stays
// in sync with the other messages; for 'easy' this prints "... with Easy Bugs ...".
const difficultyHeading = DIFFICULTY_LEVEL.charAt(0).toUpperCase() + DIFFICULTY_LEVEL.slice(1);
console.log(`\n=== Available Repositories with ${difficultyHeading} Bugs ===`);
for (const { repo, count } of repoStats) {
  console.log(`${repo}: ${count} eligible ${DIFFICULTY_LEVEL} bugs`);
}
// Select bugs trying to maximize repo diversity.
// Repos are visited round-robin in repoStats order; in round r each repo that
// still has an r-th bug contributes it. Selection stops as soon as SAMPLE_SIZE
// bugs are collected, or after a full round in which no repo had a bug left.
// Buckets are read by index rather than drained, so bugsByRepo is left intact.
// The sample uses the same array type as the per-repo buckets.
const selectedBugs: (typeof bugsByRepo)[string] = [];
for (let round = 0; selectedBugs.length < SAMPLE_SIZE; round++) {
  let tookBugThisRound = false;
  for (const { repo } of repoStats) {
    if (selectedBugs.length >= SAMPLE_SIZE) {
      break;
    }
    const repoBugs = bugsByRepo[repo];
    if (round < repoBugs.length) {
      selectedBugs.push(repoBugs[round]);
      tookBugThisRound = true;
    }
  }
  // An empty round means every repo is exhausted: fewer than SAMPLE_SIZE
  // eligible bugs exist, so return what was collected.
  if (!tookBugThisRound) {
    break;
  }
}
// Serialize the sample as pretty-printed JSON (2-space indent) and save it
const sampleJson = JSON.stringify(selectedBugs, null, 2);
fs.writeFileSync(OUTPUT_FILE, sampleJson);
// Print statistics
console.log('\n=== Sample Generation Statistics ===');
console.log(`Total ${DIFFICULTY_LEVEL} bugs selected: ${selectedBugs.length}`);
console.log('\nDistribution by repository:');

// Count selected bugs per repo. The counter has a null prototype so that a repo
// name matching an Object.prototype member (e.g. "constructor") starts from an
// absent entry instead of the inherited member, which would corrupt the count.
const finalDistribution: Record<string, number> = Object.create(null);
for (const bug of selectedBugs) {
  const seenSoFar = finalDistribution[bug.repo];
  finalDistribution[bug.repo] = seenSoFar === undefined ? 1 : seenSoFar + 1;
}

// Report repos with the most selected bugs first.
const distributionRows = Object.entries(finalDistribution).sort(
  ([, countA], [, countB]) => countB - countA,
);
for (const [repo, count] of distributionRows) {
  console.log(`- ${repo}: ${count} bugs`);
}