SWE-bench
diff --git a/css/components.css b/css/components.css
Lines changed: 121 additions & 0 deletions
diff --git a/data/combine_per_instance_details.py b/data/combine_per_instance_details.py
Lines changed: 155 additions & 0 deletions
diff --git a/data/info_for_leaderboard.json b/data/info_for_leaderboard.json
Lines changed: 1 addition & 0 deletions
@@ -367,6 +367,40 @@ button:focus,
   white-space: nowrap;
 }
 
+/* Selection checkbox column: width, min-width and max-width are all pinned to 36px; content is centered and never wraps */
+.data-table.has-select-col th.select-col,
+.data-table.has-select-col td.select-col {
+  width: 36px;
+  min-width: 36px;
+  max-width: 36px;
+  text-align: center;
+  white-space: nowrap;
+}
+
+/* Model column (the cell right after the select column): size automatically and allow wrapping. NOTE(review): these selectors are more specific than the nth-child(2) selectors in the next rule, so where both match, this width: auto overrides that rule's width: 40% - confirm which one is intended */
+.data-table.has-select-col th.sortable[data-sort="name"],
+.data-table.has-select-col td:first-of-type + td {
+  width: auto;
+  white-space: normal;
+}
+
+/* Model column (2nd column when the select column exists): bounded to 180px-350px, wrapping text, left-aligned */
+.data-table.has-select-col th:nth-child(2),
+.data-table.has-select-col td:nth-child(2) {
+  width: 40%;
+  min-width: 180px;
+  max-width: 350px;
+  white-space: normal;
+  word-wrap: break-word;
+  text-align: left; /* override the default right alignment of % Resolved (presumably the 2nd column when there is no select column - confirm against the base table styles) */
+}
+
+/* Keep % Resolved right-aligned (it is the 3rd column once the select column is present) */
+.data-table.has-select-col th:nth-child(3),
+.data-table.has-select-col td:nth-child(3) {
+  text-align: right;
+}
+
 /* Cards */
 .card {
   background-color: var(--color-background);
@@ -616,6 +650,19 @@ button:focus,
   }
 }
 
+/* Modal basic styles. Minimum sizes are clamped with min() because min-width/min-height win over width/max-width/max-height when they conflict, which would otherwise push the dialog past a small viewport. */
+.modal { display: none; position: fixed; inset: 0; z-index: var(--z-modal); }
+.modal.show { display: block; }
+.modal-backdrop { position: absolute; inset: 0; background: rgba(0,0,0,0.45); }
+.modal-dialog { position: relative; background: var(--color-background); color: var(--color-text); width: min(720px, calc(100vw - 2rem)); margin: 5vh auto; border-radius: var(--radius-lg); box-shadow: var(--shadow-xl); border: 1.5px solid var(--color-border); resize: both; overflow: auto; min-width: min(400px, 90vw); min-height: min(300px, 90vh); max-width: 90vw; max-height: 90vh; display: flex; flex-direction: column; }
+.modal-dialog-small { width: min(480px, calc(100vw - 2rem)); min-width: min(320px, 90vw); min-height: auto; resize: none; }
+.modal-dialog-large { width: min(1200px, 90vw); height: min(800px, 90vh); }
+.modal-header { display: flex; align-items: center; justify-content: space-between; padding: 0.75rem 1rem; border-bottom: 1.5px solid var(--color-border); }
+.modal-body { padding: 1rem; overflow: auto; flex: 1; display: flex; flex-direction: column; }
+.modal-close { background: transparent; border: none; cursor: pointer; color: var(--color-text-secondary); }
+.chart-container { flex: 1; display: flex; flex-direction: column; min-height: 0; position: relative; }
+.chart-container canvas { flex: 1; min-height: 260px; }
+
 @media (max-width: 992px) {
   /* On mobile and tablets */
   .table-responsive {
@@ -919,3 +966,77 @@ button:focus,
   text-decoration-thickness: 2px;
   text-decoration-color: var(--color-text-muted);
 }
+
+/* "New" feature badge: non-interactive pill placed just below its containing block and centered horizontally; shown via the 6s newBadgeAnimation, whose final (hidden) frame is kept by `forwards` */
+.new-badge {
+  pointer-events: none;
+  position: absolute;
+  z-index: 100;
+  top: 100%;
+  left: 50%;
+  margin-top: 2px;
+  transform: translateX(-50%);
+  padding: 0.25rem 0.5rem;
+  border-radius: var(--radius-full);
+  background: linear-gradient(135deg, var(--color-accent), var(--color-accent-dark));
+  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.4);
+  color: white;
+  font-size: 0.7rem;
+  font-weight: var(--weight-medium);
+  white-space: nowrap;
+  animation: newBadgeAnimation 6s ease-in-out forwards;
+}
+
+.dark-mode .new-badge {
+  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.6);
+  background: linear-gradient(135deg, var(--blue-400), var(--blue-600));
+}
+
+.new-badge-button {
+  right: -45px;
+  left: auto;
+  top: -12px;
+}
+
+@keyframes newBadgeAnimation { /* pop in, pulse (1.1 once, then 1.05 three times), hold, fade out; translateX(-50%) is restated in every frame */
+  from {
+    transform: translateX(-50%) scale(0.8);
+    opacity: 0;
+  }
+  10% {
+    transform: translateX(-50%) scale(1);
+    opacity: 1;
+  }
+  15% {
+    transform: translateX(-50%) scale(1.1);
+  }
+  20% {
+    transform: translateX(-50%) scale(1);
+  }
+  30% {
+    transform: translateX(-50%) scale(1.05);
+  }
+  35% {
+    transform: translateX(-50%) scale(1);
+  }
+  45% {
+    transform: translateX(-50%) scale(1.05);
+  }
+  50% {
+    transform: translateX(-50%) scale(1);
+  }
+  60% {
+    transform: translateX(-50%) scale(1.05);
+  }
+  65% {
+    transform: translateX(-50%) scale(1);
+  }
+  85% {
+    transform: translateX(-50%) scale(1);
+    opacity: 1;
+  }
+  to {
+    transform: translateX(-50%) scale(0.8);
+    opacity: 0;
+  }
+}
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+Script to combine per-instance details from info_for_leaderboard.json
+into the leaderboards.json file for all model entries.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+# Maps each key of info_for_leaderboard.json to the exact 'name' of the matching 'bash-only' leaderboard entry (main() compares names with ==)
+MODEL_MAPPING = {
+    'gpt-5': 'GPT-5 (2025-08-07) (medium reasoning)',
+    'gpt-5-mini': 'GPT-5 mini (2025-08-07) (medium reasoning)',
+    'sonnet-4': 'Claude 4 Sonnet (20250514)',
+    'sonnet-4-5': 'Claude 4.5 Sonnet (20250929)',
+}
+
+
+def main():
+    # Define file paths
+    script_dir = Path(__file__).parent
+    info_file = script_dir / "info_for_leaderboard.json"
+    leaderboards_file = script_dir / "leaderboards.json"
+    backup_file = script_dir / "leaderboards.json.backup"
+
+    # Check files exist
+    if not info_file.exists():
+        print(f"Error: {info_file} not found")
+        return 1
+
+    if not leaderboards_file.exists():
+        print(f"Error: {leaderboards_file} not found")
+        return 1
+
+    # Load the info file
+    print(f"Loading {info_file}...")
+    with open(info_file, 'r') as f:
+        info_data = json.load(f)
+
+    print(f"Found {len(info_data)} model entries in info file")
+    print(f"Available models: {list(info_data.keys())}")
+
+    # Load leaderboards
+    print(f"\nLoading {leaderboards_file}...")
+    with open(leaderboards_file, 'r') as f:
+        leaderboards_data = json.load(f)
+
+    # Find bash-only leaderboard
+    bash_only = None
+    bash_only_idx = None
+    for idx, lb in enumerate(leaderboards_data['leaderboards']):
+        if lb.get('name') == 'bash-only':
+            bash_only = lb
+            bash_only_idx = idx
+            break
+
+    if bash_only is None:
+        print("Error: 'bash-only' leaderboard not found")
+        return 1
+
+    print(f"Found 'bash-only' leaderboard with {len(bash_only['results'])} entries")
+
+    # Track which models will be updated
+    models_to_update = []
+    for info_key, leaderboard_name in MODEL_MAPPING.items():
+        if info_key not in info_data:
+            print(f"\nWarning: '{info_key}' not found in info file, skipping...")
+            continue
+
+        # Find the entry in leaderboard
+        entry_idx = None
+        for idx, result in enumerate(bash_only['results']):
+            if result.get('name') == leaderboard_name:
+                entry_idx = idx
+                break
+
+        if entry_idx is None:
+            print(f"\nWarning: '{leaderboard_name}' not found in leaderboard, skipping...")
+            continue
+
+        # Check if already has per_instance_details
+        has_details = 'per_instance_details' in bash_only['results'][entry_idx]
+        num_instances = len(info_data[info_key])
+
+        models_to_update.append({
+            'info_key': info_key,
+            'leaderboard_name': leaderboard_name,
+            'entry_idx': entry_idx,
+            'num_instances': num_instances,
+            'has_details': has_details,
+        })
+
+        status = "(will overwrite)" if has_details else "(new)"
+        print(f"\n  - {leaderboard_name} {status}")
+        print(f"    {num_instances} instances from '{info_key}'")
+
+    if not models_to_update:
+        print("\nError: No models to update")
+        return 1
+
+    # Ask for confirmation
+    print(f"\n{'='*60}")
+    print(f"Will update {len(models_to_update)} model(s)")
+
+    overwrite_count = sum(1 for m in models_to_update if m['has_details'])
+    if overwrite_count > 0:
+        print(f"Warning: {overwrite_count} model(s) already have per_instance_details")
+
+    response = input("\nContinue? (yes/no): ").strip().lower()
+    if response != 'yes':
+        print("Aborted.")
+        return 0
+
+    # Create backup
+    print(f"\nCreating backup at {backup_file}...")
+    with open(backup_file, 'w') as f:
+        json.dump(leaderboards_data, f, indent=2)
+
+    # Update all models
+    print("\nUpdating models...")
+    for model in models_to_update:
+        info_key = model['info_key']
+        entry_idx = model['entry_idx']
+        leaderboard_name = model['leaderboard_name']
+
+        per_instance_details = info_data[info_key]
+        leaderboards_data['leaderboards'][bash_only_idx]['results'][entry_idx]['per_instance_details'] = per_instance_details
+
+        print(f"  ✓ {leaderboard_name}: {len(per_instance_details)} instances")
+
+    # Write updated data
+    print(f"\nWriting updated data to {leaderboards_file}...")
+    with open(leaderboards_file, 'w') as f:
+        json.dump(leaderboards_data, f, indent=2)
+
+    print("\n" + "="*60)
+    print("✓ Success! All models updated")
+    print(f"  - Backup saved to: {backup_file}")
+    print(f"  - Models updated: {len(models_to_update)}")
+
+    # Show sample of added data for first model
+    if models_to_update:
+        first_model = models_to_update[0]
+        print(f"\nSample instances from {first_model['leaderboard_name']}:")
+        sample_data = info_data[first_model['info_key']]
+        for i, (key, value) in enumerate(list(sample_data.items())[:3]):
+            print(f"  - {key}: resolved={value.get('resolved')}, cost={value.get('cost')}")
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())  # main()'s return value (0 or 1) becomes the process exit status