Skip to content

Commit a0304f2

Browse files
authored
Merge pull request #39 from SWE-bench/add-comparison-plots
add comparison plots
2 parents c7f5d65 + d6e0839 commit a0304f2

16 files changed

+12141
-10
lines changed

css/components.css

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,40 @@ button:focus,
367367
white-space: nowrap;
368368
}
369369

370+
/* Checkbox selection column: pinned to a narrow, fixed 36px track */
.data-table.has-select-col th.select-col,
.data-table.has-select-col td.select-col {
  /* width, min-width and max-width are all set so the column can neither
     grow nor shrink */
  min-width: 36px;
  max-width: 36px;
  width: 36px;
  white-space: nowrap;
  text-align: center;
}
379+
380+
/* When the selection column is present, stop the model (name) column from
   expanding: size it automatically and let its text wrap */
.data-table.has-select-col th.sortable[data-sort="name"],
.data-table.has-select-col td:first-of-type + td {
  white-space: normal;
  width: auto;
}
386+
387+
/* Model column (second cell when the select column exists): bounded width,
   wrapping text, left-aligned.
   NOTE(review): where a cell also matches the name-column rule declared just
   before this one, that rule's selectors are more specific, so its
   `width: auto` takes precedence over the `width: 40%` below — confirm which
   width is intended. */
.data-table.has-select-col th:nth-child(2),
.data-table.has-select-col td:nth-child(2) {
  text-align: left; /* override default % resolved right alignment */
  white-space: normal;
  word-wrap: break-word;
  min-width: 180px;
  max-width: 350px;
  width: 40%;
}
397+
398+
/* % Resolved moves to the third position once the select column exists;
   keep it right-aligned there */
.data-table.has-select-col th:nth-child(3),
.data-table.has-select-col td:nth-child(3) {
  text-align: right;
}
403+
370404
/* Cards */
371405
.card {
372406
background-color: var(--color-background);
@@ -616,6 +650,19 @@ button:focus,
616650
}
617651
}
618652

653+
/* Modal basic styles */

/* Full-viewport overlay container; hidden until the `show` class is added */
.modal {
  display: none;
  position: fixed;
  inset: 0;
  z-index: var(--z-modal);
}

.modal.show {
  display: block;
}

/* Dimmed layer filling the overlay behind the dialog */
.modal-backdrop {
  position: absolute;
  inset: 0;
  background: rgba(0, 0, 0, 0.45);
}

/* Dialog panel: user-resizable flex column, centred horizontally */
.modal-dialog {
  position: relative;
  display: flex;
  flex-direction: column;
  margin: 5vh auto;
  width: min(720px, calc(100vw - 2rem));
  /* A minimum overrides a maximum when the two conflict, so a fixed 400px /
     300px minimum would push the dialog past the 90vw / 90vh caps on small
     viewports. Clamp the minimums to the same caps. */
  min-width: min(400px, 90vw);
  min-height: min(300px, 90vh);
  max-width: 90vw;
  max-height: 90vh;
  overflow: auto;
  resize: both;
  background: var(--color-background);
  color: var(--color-text);
  border: 1.5px solid var(--color-border);
  border-radius: var(--radius-lg);
  box-shadow: var(--shadow-xl);
}

/* Compact, non-resizable variant */
.modal-dialog-small {
  width: min(480px, calc(100vw - 2rem));
  /* Clamped for the same reason as the base dialog's minimum width */
  min-width: min(320px, 90vw);
  min-height: auto;
  resize: none;
}

/* Large variant with an explicit initial height */
.modal-dialog-large {
  width: min(1200px, 90vw);
  height: min(800px, 90vh);
}

.modal-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 0.75rem 1rem;
  border-bottom: 1.5px solid var(--color-border);
}

/* Body takes the remaining dialog height and scrolls its own overflow */
.modal-body {
  display: flex;
  flex-direction: column;
  flex: 1;
  padding: 1rem;
  overflow: auto;
}

.modal-close {
  background: transparent;
  border: none;
  cursor: pointer;
  color: var(--color-text-secondary);
}

/* Chart wrapper: min-height 0 lets this flex child shrink below its content
   height inside the column layout */
.chart-container {
  position: relative;
  display: flex;
  flex-direction: column;
  flex: 1;
  min-height: 0;
}

.chart-container canvas {
  flex: 1;
  min-height: 260px;
}
665+
619666
@media (max-width: 992px) {
620667
/* On mobile and tablets */
621668
.table-responsive {
@@ -919,3 +966,77 @@ button:focus,
919966
text-decoration-thickness: 2px;
920967
text-decoration-color: var(--color-text-muted);
921968
}
969+
970+
/* New Feature Badge */
.new-badge {
  /* Placement: just below the positioned ancestor, centred horizontally */
  position: absolute;
  top: 100%;
  left: 50%;
  margin-top: 2px;
  transform: translateX(-50%);
  z-index: 100;

  /* Pill appearance */
  padding: 0.25rem 0.5rem;
  border-radius: var(--radius-full);
  background: linear-gradient(135deg, var(--color-accent), var(--color-accent-dark));
  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.4);
  color: white;
  font-size: 0.7rem;
  font-weight: var(--weight-medium);
  white-space: nowrap;

  /* Plays once over 6s; `forwards` keeps the final keyframe applied afterwards */
  animation: newBadgeAnimation 6s ease-in-out forwards;
  /* The badge never intercepts pointer events */
  pointer-events: none;
}
989+
990+
/* Dark mode: blue-palette gradient and a more opaque shadow for the badge */
.dark-mode .new-badge {
  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.6);
  background: linear-gradient(135deg, var(--blue-400), var(--blue-600));
}
994+
995+
/* Button variant: overrides the badge's offsets so it sits above the top
   edge and past the right edge of its positioned ancestor */
.new-badge-button {
  right: -45px;
  left: auto;
  top: -12px;
}
1000+
1001+
/* Badge animation: fade/scale in, one larger pop, three small pulses, hold,
   then fade/scale out. Every frame keeps translateX(-50%) because the
   animated `transform` replaces the element's own transform. Keyframes with
   identical declarations share one selector list. */
@keyframes newBadgeAnimation {
  /* Hidden and slightly shrunk at both ends */
  0%, 100% {
    opacity: 0;
    transform: translateX(-50%) scale(0.8);
  }

  /* Fully visible at rest size: end of fade-in, start of fade-out */
  10%, 85% {
    opacity: 1;
    transform: translateX(-50%) scale(1);
  }

  /* Initial, larger pop */
  15% {
    transform: translateX(-50%) scale(1.1);
  }

  /* Peaks of the three smaller pulses */
  30%, 45%, 60% {
    transform: translateX(-50%) scale(1.05);
  }

  /* Return to rest size after each pop/pulse */
  20%, 35%, 50%, 65% {
    transform: translateX(-50%) scale(1);
  }
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script to combine per-instance details from info_for_leaderboard.json
4+
into the leaderboards.json file for all model entries.
5+
"""
6+
7+
import json
8+
import sys
9+
from pathlib import Path
10+
11+
12+
# Translates each key used in info_for_leaderboard.json into the `name` of
# the matching leaderboard entry.
MODEL_MAPPING = dict(
    [
        ("gpt-5", "GPT-5 (2025-08-07) (medium reasoning)"),
        ("gpt-5-mini", "GPT-5 mini (2025-08-07) (medium reasoning)"),
        ("sonnet-4", "Claude 4 Sonnet (20250514)"),
        ("sonnet-4-5", "Claude 4.5 Sonnet (20250929)"),
    ]
)
19+
20+
21+
def main():
22+
# Define file paths
23+
script_dir = Path(__file__).parent
24+
info_file = script_dir / "info_for_leaderboard.json"
25+
leaderboards_file = script_dir / "leaderboards.json"
26+
backup_file = script_dir / "leaderboards.json.backup"
27+
28+
# Check files exist
29+
if not info_file.exists():
30+
print(f"Error: {info_file} not found")
31+
return 1
32+
33+
if not leaderboards_file.exists():
34+
print(f"Error: {leaderboards_file} not found")
35+
return 1
36+
37+
# Load the info file
38+
print(f"Loading {info_file}...")
39+
with open(info_file, 'r') as f:
40+
info_data = json.load(f)
41+
42+
print(f"Found {len(info_data)} model entries in info file")
43+
print(f"Available models: {list(info_data.keys())}")
44+
45+
# Load leaderboards
46+
print(f"\nLoading {leaderboards_file}...")
47+
with open(leaderboards_file, 'r') as f:
48+
leaderboards_data = json.load(f)
49+
50+
# Find bash-only leaderboard
51+
bash_only = None
52+
bash_only_idx = None
53+
for idx, lb in enumerate(leaderboards_data['leaderboards']):
54+
if lb.get('name') == 'bash-only':
55+
bash_only = lb
56+
bash_only_idx = idx
57+
break
58+
59+
if bash_only is None:
60+
print("Error: 'bash-only' leaderboard not found")
61+
return 1
62+
63+
print(f"Found 'bash-only' leaderboard with {len(bash_only['results'])} entries")
64+
65+
# Track which models will be updated
66+
models_to_update = []
67+
for info_key, leaderboard_name in MODEL_MAPPING.items():
68+
if info_key not in info_data:
69+
print(f"\nWarning: '{info_key}' not found in info file, skipping...")
70+
continue
71+
72+
# Find the entry in leaderboard
73+
entry_idx = None
74+
for idx, result in enumerate(bash_only['results']):
75+
if result.get('name') == leaderboard_name:
76+
entry_idx = idx
77+
break
78+
79+
if entry_idx is None:
80+
print(f"\nWarning: '{leaderboard_name}' not found in leaderboard, skipping...")
81+
continue
82+
83+
# Check if already has per_instance_details
84+
has_details = 'per_instance_details' in bash_only['results'][entry_idx]
85+
num_instances = len(info_data[info_key])
86+
87+
models_to_update.append({
88+
'info_key': info_key,
89+
'leaderboard_name': leaderboard_name,
90+
'entry_idx': entry_idx,
91+
'num_instances': num_instances,
92+
'has_details': has_details,
93+
})
94+
95+
status = "(will overwrite)" if has_details else "(new)"
96+
print(f"\n - {leaderboard_name} {status}")
97+
print(f" {num_instances} instances from '{info_key}'")
98+
99+
if not models_to_update:
100+
print("\nError: No models to update")
101+
return 1
102+
103+
# Ask for confirmation
104+
print(f"\n{'='*60}")
105+
print(f"Will update {len(models_to_update)} model(s)")
106+
107+
overwrite_count = sum(1 for m in models_to_update if m['has_details'])
108+
if overwrite_count > 0:
109+
print(f"Warning: {overwrite_count} model(s) already have per_instance_details")
110+
111+
response = input("\nContinue? (yes/no): ").strip().lower()
112+
if response != 'yes':
113+
print("Aborted.")
114+
return 0
115+
116+
# Create backup
117+
print(f"\nCreating backup at {backup_file}...")
118+
with open(backup_file, 'w') as f:
119+
json.dump(leaderboards_data, f, indent=2)
120+
121+
# Update all models
122+
print("\nUpdating models...")
123+
for model in models_to_update:
124+
info_key = model['info_key']
125+
entry_idx = model['entry_idx']
126+
leaderboard_name = model['leaderboard_name']
127+
128+
per_instance_details = info_data[info_key]
129+
leaderboards_data['leaderboards'][bash_only_idx]['results'][entry_idx]['per_instance_details'] = per_instance_details
130+
131+
print(f" ✓ {leaderboard_name}: {len(per_instance_details)} instances")
132+
133+
# Write updated data
134+
print(f"\nWriting updated data to {leaderboards_file}...")
135+
with open(leaderboards_file, 'w') as f:
136+
json.dump(leaderboards_data, f, indent=2)
137+
138+
print("\n" + "="*60)
139+
print("✓ Success! All models updated")
140+
print(f" - Backup saved to: {backup_file}")
141+
print(f" - Models updated: {len(models_to_update)}")
142+
143+
# Show sample of added data for first model
144+
if models_to_update:
145+
first_model = models_to_update[0]
146+
print(f"\nSample instances from {first_model['leaderboard_name']}:")
147+
sample_data = info_data[first_model['info_key']]
148+
for i, (key, value) in enumerate(list(sample_data.items())[:3]):
149+
print(f" - {key}: resolved={value.get('resolved')}, cost={value.get('cost')}")
150+
151+
return 0
152+
153+
154+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

data/info_for_leaderboard.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)