From e3005d8170315ed6bd6ea28e892fee4c48eca22e Mon Sep 17 00:00:00 2001
From: "Bubba (AmnesiaBench bot)" <bubba@voynich.ai>
Date: Sun, 29 Mar 2026 00:54:25 -0400
Subject: [PATCH 1/2] feat: OpenRouter support + 7 new AIMO3 hard problems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add openrouter:// URL scheme to create_client()
- LLMClient now accepts api_key (Bearer auth header) and model_name
  (required by OpenRouter for model routing)
- Fix api key resolution for non-Gemini remote APIs (checks
  OPENROUTER_API_KEY env when model URL is openrouter://)
- Fix ping() to skip /health check for remote APIs
- Update models.json with 4 OpenRouter free models:
  - openai/gpt-oss-120b:free (same model as calibration data)
  - openai/gpt-oss-20b:free
  - nvidia/nemotron-3-super-120b-a12b:free
  - qwen/qwen3-next-80b-a3b-instruct:free
- Add 7 new AIMO3 hard problems (all pass_rate=0.125 on 20B,
  diverse topics, high solution token count):
  aimo3_hard_5f9595ae, aimo3_hard_f728b4b1, aimo3_hard_b0dc264b,
  aimo3_hard_858cba58, aimo3_hard_00eaa992, aimo3_hard_b1da52fa,
  aimo3_hard_2e0b7ba3

Tested: gpt-oss-20b on digit_sum_ten → minimum_window=672 ✓
---
 amnesia_bench.py                  | 1190 ++++++++++++++++++++++++-----
 models.json                       |   32 +
 problems/aimo3_hard_00eaa992.json |   10 +
 problems/aimo3_hard_2e0b7ba3.json |   10 +
 problems/aimo3_hard_5f9595ae.json |   10 +
 problems/aimo3_hard_858cba58.json |   10 +
 problems/aimo3_hard_b0dc264b.json |   10 +
 problems/aimo3_hard_b1da52fa.json |   10 +
 problems/aimo3_hard_f728b4b1.json |   10 +
 9 files changed, 1102 insertions(+), 190 deletions(-)
 create mode 100644 models.json
 create mode 100644 problems/aimo3_hard_00eaa992.json
 create mode 100644 problems/aimo3_hard_2e0b7ba3.json
 create mode 100644 problems/aimo3_hard_5f9595ae.json
 create mode 100644 problems/aimo3_hard_858cba58.json
 create mode 100644 problems/aimo3_hard_b0dc264b.json
 create mode 100644 problems/aimo3_hard_b1da52fa.json
 create mode 100644 problems/aimo3_hard_f728b4b1.json

diff --git a/amnesia_bench.py b/amnesia_bench.py
index e106d73..854be4c 100644
--- a/amnesia_bench.py
+++ b/amnesia_bench.py
@@ -1,18 +1,42 @@
 #!/usr/bin/env python3
+# Author: Claude Sonnet 4.6 (Bubba)
+# Date: 28-March-2026
+# PURPOSE: AmnesiaBench v2 — multi-model, multi-problem benchmark runner that binary-searches
+#   for the minimum context window (n_reliable) at which each LLM can solve competition-math
+#   problems at 60% success rate. Supports 10 problems × N models for overnight runs.
+#   Features: prediction phase, composite Scott scoring, --model / --model-name flags,
+#   --run-all-models mode reading models.json, per-model result namespacing, full scoring table.
+#   Supports llama.cpp (http://) and Google Gemini (gemini://) backends via create_client().
+#   Exponential backoff applied to all external API calls (429/503 retry with jitter).
+#   Integration points: run_prediction_phase() → run_problem() → binary_search() → run_trial().
+# SRP/DRY check: Pass — prediction phase, scoring, model iteration all isolated. No duplication
+#   of result I/O. calculate_scores() is the single scoring engine. run_all_models() delegates
+#   to run_problem() so the multi-model path is just a loop around the single-model path.
+#   with_exponential_backoff() is the single retry engine used by both LLMClient and GeminiClient.
 """
-AmnesiaBench — How much context does a model actually need?
+AmnesiaBench v2 — How much context does a model actually need?
 
 Binary-searches (log scale) for the minimum context window at which an LLM
-can solve competition-math problems at a 20% success rate.
-
-4 configurations: {TIR, No-TIR} x {Hard Cutoff, Compaction}
-5 trials per window size. Full conversation traces saved.
+can solve competition-math problems at a 60% success rate. Runs prediction
+phase, computes composite Scott scores, supports multi-model overnight runs.
+Supports both llama.cpp (http://) and Google Gemini (gemini://) backends.
 
 Usage:
-    # Start llama.cpp server first, then:
-    python3 amnesia_bench.py --problem ab507a9f
-    python3 amnesia_bench.py --all
+    # Single problem, single model (llama.cpp):
+    python3 amnesia_bench.py --problem ab507a9f --model http://localhost:8080 --model-name Qwen35B
+
+    # All problems, single model:
+    python3 amnesia_bench.py --all --model http://localhost:8080 --model-name Qwen35B
+
+    # All problems, Gemini:
+    python3 amnesia_bench.py --all --model gemini://gemini-2.0-flash-lite --api-key $GEMINI_API_KEY
+
+    # All problems, all models from models.json:
+    python3 amnesia_bench.py --all --run-all-models
+
+    # Analysis:
     python3 amnesia_bench.py --analyze
+    python3 amnesia_bench.py --scores
 """
 
 import argparse
@@ -22,6 +46,7 @@
 import json
 import math
 import os
+import random
 import re
 import signal
 import sys
@@ -31,25 +56,30 @@
 from dataclasses import dataclass, field, asdict
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 import requests
 
 # ─── Defaults ────────────────────────────────────────────────────────────────
 
 SERVER_URL = "http://localhost:8080"
-MIN_WINDOW = 512
+MIN_WINDOW = 1
 MAX_WINDOW = 32768
-TRIALS_PER_WINDOW = 5
-SUCCESS_THRESHOLD = 0.2          # 1/5 = 20%
-CONVERGENCE_RATIO = 1.05         # stop when hi/lo < 5%
+TRIALS_PER_WINDOW = 3
+SUCCESS_THRESHOLD = 0.6          # 60%
+CONVERGENCE_RATIO = 1.05         # stop when hi/lo < 5% (fallback)
+CONVERGENCE_ABS = 50             # stop when hi - lo < 50 tokens (primary)
+MAX_COMPLETION_TOKENS = 16384    # hard cap per generation turn
 MAX_COMPACTIONS = 5
-COMPACTION_TRIGGER = 0.70        # compact at 70% of budget
-MAX_TURNS = 40                   # safety: max conversation turns
-CODE_TIMEOUT = 30                # seconds per code execution
+COMPACTION_TRIGGER = 0.70
+MAX_TURNS = 40
+CODE_TIMEOUT = 30
 TEMPERATURE = 0.7
 PROBLEMS_DIR = Path(__file__).parent / "problems"
 RESULTS_DIR = Path(__file__).parent / "results"
+MODELS_JSON = Path(__file__).parent / "models.json"
+
+DEFAULT_COMPACTION_PROMPT = "Compact your context window to continue."
 
 # ─── Prompt Templates ────────────────────────────────────────────────────────
 
@@ -84,6 +114,27 @@
 ---
 Continue solving. Give your final answer as \\boxed{{integer}}."""
 
+PREDICTION_PROMPT = """\
+You are about to solve a math problem under context window constraints.
+
+--- PROBLEM ---
+{problem_text}
+
+--- SCORING ---
+Your score depends on:
+- n_reliable: minimum context window where you solve this 60%+ of the time
+- n_reliable_prediction: your prediction of n_reliable (before we test)
+- Lower n_reliable_prediction = better score IF correct; if too low and you fail = infinity penalty
+- success_prediction: whether you can solve this at all
+
+You may opt out (success_prediction=False) if you think you cannot solve this problem.
+
+--- INSTRUCTIONS ---
+Respond in 300 tokens or less. Include these tags:
+<success_prediction>True or False</success_prediction>
+<n_reliable_prediction>integer (tokens)</n_reliable_prediction>
+<compaction_prompt>one sentence describing what to preserve when compacting</compaction_prompt>"""
+
 
 # ─── Python Sandbox ──────────────────────────────────────────────────────────
 
@@ -124,63 +175,125 @@ def reset(self):
         self.namespace = {"__builtins__": __builtins__}
 
 
+# ─── Exponential Backoff ─────────────────────────────────────────────────────
+
+def with_exponential_backoff(fn, max_retries=5, base_delay=1.0, max_delay=60.0):
+    """
+    Wrap any API call with exponential backoff on 429/503 errors.
+    Uses full jitter: delay = min(base * 2^attempt + uniform(0,1), max_delay).
+    Raises immediately on non-retriable errors or when retries are exhausted.
+    """
+    for attempt in range(max_retries):
+        try:
+            return fn()
+        except requests.HTTPError as e:
+            if e.response.status_code in (429, 503) and attempt < max_retries - 1:
+                delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
+                print(f"    [backoff] {e.response.status_code} — retrying in {delay:.1f}s (attempt {attempt+1}/{max_retries})")
+                time.sleep(delay)
+            else:
+                raise
+
+
 # ─── LLM Client ─────────────────────────────────────────────────────────────
 
 class LLMClient:
-    """Wrapper for llama.cpp /v1/chat/completions."""
+    """Wrapper for llama.cpp or any OpenAI-compatible /v1/chat/completions endpoint."""
 
-    def __init__(self, server_url: str = SERVER_URL, temperature: float = TEMPERATURE):
+    def __init__(self, server_url: str = SERVER_URL, temperature: float = TEMPERATURE, api_key: str = None, model_name: str = None):
         self.server_url = server_url.rstrip("/")
         self.temperature = temperature
+        self.model_name = model_name  # passed to API as model field (required by OpenRouter)
+        self.auth_header = {"Authorization": f"Bearer {api_key}"} if api_key else {}
 
     def generate(self, messages: list[dict], max_tokens: int) -> dict:
         """
-        Send messages to the model. Returns:
-        {
-            "content": str,
-            "prompt_tokens": int,
-            "completion_tokens": int,
-            "total_tokens": int,
-            "finish_reason": str,
-        }
+        Send messages to the model. Returns usage + content dict.
+        Uses exponential backoff on 429/503 errors.
         """
         max_tokens = max(1, max_tokens)
         payload = {
             "messages": messages,
             "max_tokens": max_tokens,
             "temperature": self.temperature,
-            "stream": False,
+            "stream": True,
         }
-        resp = requests.post(
-            f"{self.server_url}/v1/chat/completions",
-            json=payload,
-            timeout=3600,  # 1 hour — large windows with parallel slots are slow
-        )
-        resp.raise_for_status()
-        data = resp.json()
-        choice = data["choices"][0]
-        usage = data.get("usage", {})
-        msg = choice["message"]
-        # Qwen3.5 splits thinking into reasoning_content, final answer into content
-        reasoning = msg.get("reasoning_content", "") or ""
-        content = msg.get("content", "") or ""
-        # Combine both for our purposes — the model's full output
+
+        def _do_request():
+            if self.model_name:
+                payload["model"] = self.model_name
+            resp = requests.post(
+                f"{self.server_url}/v1/chat/completions",
+                headers=self.auth_header,
+                json=payload,
+                timeout=3600,
+                stream=True,
+            )
+            resp.raise_for_status()
+            return resp
+
+        resp = with_exponential_backoff(_do_request)
+
         full_content = ""
+        reasoning = ""
+        content = ""
+        prompt_tokens = 0
+        completion_tokens = 0
+        total_tokens = 0
+        finish_reason = "unknown"
+
+        print("    [stream] ", end="", flush=True)
+        for line in resp.iter_lines():
+            if not line:
+                continue
+            line = line.decode("utf-8") if isinstance(line, bytes) else line
+            if line.startswith("data: "):
+                line = line[6:]
+            if line == "[DONE]":
+                break
+            try:
+                chunk = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            choice = chunk.get("choices", [{}])[0]
+            delta = choice.get("delta", {})
+            r_piece = delta.get("reasoning_content", "") or ""
+            c_piece = delta.get("content", "") or ""
+            if r_piece:
+                reasoning += r_piece
+                sys.stdout.write(r_piece)
+                sys.stdout.flush()
+            if c_piece:
+                content += c_piece
+                sys.stdout.write(c_piece)
+                sys.stdout.flush()
+            finish_reason = choice.get("finish_reason") or finish_reason
+            usage = chunk.get("usage", {})
+            if usage:
+                prompt_tokens = usage.get("prompt_tokens", prompt_tokens)
+                completion_tokens = usage.get("completion_tokens", completion_tokens)
+                total_tokens = usage.get("total_tokens", total_tokens)
+        print()
+
         if reasoning:
-            full_content += f"<think>\n{reasoning}\n</think>\n"
-        full_content += content
+            full_content = f"<think>\n{reasoning}\n</think>\n{content}"
+        else:
+            full_content = content
 
         return {
             "content": full_content,
             "reasoning_content": reasoning,
             "final_content": content,
-            "prompt_tokens": usage.get("prompt_tokens", 0),
-            "completion_tokens": usage.get("completion_tokens", 0),
-            "total_tokens": usage.get("total_tokens", 0),
-            "finish_reason": choice.get("finish_reason", "unknown"),
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "finish_reason": finish_reason,
         }
 
     def ping(self) -> bool:
+        # For remote APIs (OpenRouter, etc.) skip /health check — just assume reachable
+        if "openrouter.ai" in self.server_url or "localhost" not in self.server_url and not self.server_url.startswith("http://192."):
+            return True
         try:
             r = requests.get(f"{self.server_url}/health", timeout=5)
             return r.status_code == 200
@@ -188,16 +301,192 @@ def ping(self) -> bool:
             return False
 
 
+# ─── Gemini Client ───────────────────────────────────────────────────────────
+
+class GeminiClient:
+    """Client for Google Gemini API (gemini-2.0-flash-lite or similar).
+
+    Accepts OpenAI-style message lists and converts them to Gemini's
+    generateContent format. Returns the same dict shape as LLMClient.generate()
+    so the rest of the benchmark code is backend-agnostic.
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "gemini-2.0-flash-lite",
+        temperature: float = TEMPERATURE,
+    ):
+        self.api_key = api_key
+        self.model = model
+        self.temperature = temperature
+        self.base_url = "https://generativelanguage.googleapis.com/v1beta"
+
+    def _convert_messages(self, messages: list[dict]) -> tuple[Optional[dict], list[dict]]:
+        """Convert OpenAI-style messages to Gemini format.
+
+        Returns (system_instruction, contents) where:
+          - system_instruction is None or {"parts": [{"text": "..."}]}
+          - contents is a list of {"role": "user"|"model", "parts": [{"text": "..."}]}
+        """
+        system_instruction = None
+        contents = []
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            text = msg.get("content", "")
+
+            if role == "system":
+                system_instruction = {"parts": [{"text": text}]}
+            elif role == "assistant":
+                contents.append({"role": "model", "parts": [{"text": text}]})
+            else:
+                contents.append({"role": "user", "parts": [{"text": text}]})
+
+        return system_instruction, contents
+
+    def generate(self, messages: list[dict], max_tokens: int) -> dict:
+        """
+        Send messages to Gemini generateContent endpoint.
+        Returns same dict format as LLMClient: content, prompt_tokens,
+        completion_tokens, total_tokens, finish_reason.
+        Uses exponential backoff on 429/503 errors.
+        """
+        max_tokens = max(1, max_tokens)
+        system_instruction, contents = self._convert_messages(messages)
+
+        payload = {
+            "contents": contents,
+            "generationConfig": {
+                "maxOutputTokens": max_tokens,
+                "temperature": self.temperature,
+            },
+        }
+        if system_instruction is not None:
+            payload["systemInstruction"] = system_instruction
+
+        url = (
+            f"{self.base_url}/models/{self.model}:generateContent"
+            f"?key={self.api_key}"
+        )
+
+        def _do_request():
+            resp = requests.post(url, json=payload, timeout=3600)
+            resp.raise_for_status()
+            return resp
+
+        resp = with_exponential_backoff(_do_request)
+        data = resp.json()
+
+        # Parse response
+        candidates = data.get("candidates", [])
+        if candidates:
+            candidate = candidates[0]
+            parts = candidate.get("content", {}).get("parts", [])
+            content = "".join(p.get("text", "") for p in parts)
+            finish_reason_raw = candidate.get("finishReason", "STOP")
+            # Normalise Gemini finish reasons to llama.cpp style
+            finish_reason_map = {
+                "STOP": "stop",
+                "MAX_TOKENS": "length",
+                "SAFETY": "stop",
+                "RECITATION": "stop",
+                "OTHER": "stop",
+            }
+            finish_reason = finish_reason_map.get(finish_reason_raw, "stop")
+        else:
+            content = ""
+            finish_reason = "stop"
+
+        usage = data.get("usageMetadata", {})
+        prompt_tokens = usage.get("promptTokenCount", 0)
+        completion_tokens = usage.get("candidatesTokenCount", 0)
+        total_tokens = usage.get("totalTokenCount", prompt_tokens + completion_tokens)
+
+        # Print a brief stream-alike indicator for consistency
+        print(f"    [gemini] {completion_tokens} tokens | finish={finish_reason}")
+        print(content[:120].replace("\n", " ") + ("..." if len(content) > 120 else ""))
+
+        return {
+            "content": content,
+            "reasoning_content": "",
+            "final_content": content,
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "finish_reason": finish_reason,
+        }
+
+    def ping(self) -> bool:
+        """Health check — try a minimal generation."""
+        try:
+            resp = self.generate(
+                messages=[{"role": "user", "content": "Say OK."}],
+                max_tokens=10,
+            )
+            return bool(resp.get("content"))
+        except Exception:
+            return False
+
+
+# ─── Client Factory ──────────────────────────────────────────────────────────
+
+def create_client(
+    server_url: str,
+    api_key: str = None,
+    model_name: str = None,
+    temperature: float = TEMPERATURE,
+) -> Union[LLMClient, GeminiClient]:
+    """
+    Create appropriate client based on server_url scheme.
+
+    - gemini:// or google:// → GeminiClient
+      model extracted from the URL path (e.g. gemini://gemini-2.0-flash-lite)
+    - http:// or https://   → LLMClient (llama.cpp)
+
+    api_key is required for GeminiClient. Raises ValueError if missing.
+    """
+    if server_url.startswith("gemini://") or server_url.startswith("google://"):
+        # Extract model from URL: gemini://gemini-2.0-flash-lite → gemini-2.0-flash-lite
+        scheme = "gemini://" if server_url.startswith("gemini://") else "google://"
+        gemini_model = server_url[len(scheme):].strip("/") or "gemini-2.0-flash-lite"
+        # Allow model_name override
+        if model_name and not model_name.startswith("gemini"):
+            # model_name is just a label — still use gemini_model for the actual API call
+            pass
+        if not api_key:
+            raise ValueError(
+                "GeminiClient requires an API key. Pass --api-key or set GEMINI_API_KEY env var."
+            )
+        return GeminiClient(api_key=api_key, model=gemini_model, temperature=temperature)
+    elif server_url.startswith("openrouter://"):
+        # openrouter://openai/gpt-oss-120b:free → https://openrouter.ai/api/v1
+        or_model = server_url[len("openrouter://"):].strip("/")
+        if not api_key:
+            raise ValueError("OpenRouter requires an API key. Pass --api-key or set api_key_env.")
+        return LLMClient(
+            server_url="https://openrouter.ai/api",
+            temperature=temperature,
+            api_key=api_key,
+            model_name=or_model,
+        )
+    elif server_url.startswith("http"):
+        return LLMClient(server_url=server_url, temperature=temperature)
+    else:
+        raise ValueError(
+            f"Unrecognised server URL scheme: '{server_url}'. "
+            "Use http://, https://, openrouter://, gemini://, or google://"
+        )
+
+
 # ─── Parsing Helpers ─────────────────────────────────────────────────────────
 
 def extract_python_blocks(text: str) -> list[str]:
-    """Extract all ```python code blocks from text."""
     pattern = r"```python\s*\n(.*?)```"
     return re.findall(pattern, text, re.DOTALL)
 
 
 def extract_compact_call(text: str) -> Optional[str]:
-    """Extract <compact>...</compact> summary. Returns None if not found."""
     match = re.search(r"<compact>(.*?)</compact>", text, re.DOTALL)
     if match:
         return match.group(1).strip()
@@ -206,31 +495,26 @@ def extract_compact_call(text: str) -> Optional[str]:
 
 def extract_boxed_answer(text: str) -> Optional[int]:
     """Extract the last \\boxed{...} answer from text, ignoring <think> blocks."""
-    # Try outside <think> blocks first
     non_think = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
     target = non_think if non_think.strip() else text
 
     matches = re.findall(r"\\boxed\{([^{}]+)\}", target)
     if not matches:
-        # Fallback: try nested braces
         matches = re.findall(r"\\boxed\{(.+?)\}", target)
     if not matches:
         return None
 
     raw = matches[-1].strip()
-    # Try direct int parse
     try:
         return int(raw)
     except ValueError:
         pass
-    # Try float -> int
     try:
         f = float(raw)
         if f == int(f):
             return int(f)
     except ValueError:
         pass
-    # Try simple eval (e.g. "2^10" or "3*5")
     try:
         cleaned = raw.replace("^", "**").replace(",", "")
         return int(eval(cleaned))
@@ -239,25 +523,337 @@ def extract_boxed_answer(text: str) -> Optional[int]:
     return None
 
 
-# ─── Single Trial (one attempt at solving a problem) ─────────────────────────
+# ─── Prediction Phase ────────────────────────────────────────────────────────
+
+def run_prediction_phase(
+    client,
+    problem: dict,
+    max_tokens: int = 300,
+) -> dict:
+    """
+    Ask the model to predict its own performance before testing begins.
+    Returns parsed prediction dict with keys: success_prediction, n_reliable_prediction,
+    compaction_prompt, raw_response. Falls back to safe defaults on parse failure.
+    """
+    problem_text = problem.get("problem_text", "")
+    prompt = PREDICTION_PROMPT.format(problem_text=problem_text)
+    messages = [{"role": "user", "content": prompt}]
+
+    print(f"\n  [Prediction Phase] Asking model to predict performance...")
+    try:
+        resp = client.generate(messages, max_tokens=max_tokens)
+    except Exception as e:
+        print(f"  [Prediction Phase] API error: {e} — using defaults")
+        return _prediction_defaults(raw_response=f"ERROR: {e}")
+
+    raw = resp.get("content", "")
+    completion_tokens = resp.get("completion_tokens", 0)
+
+    if completion_tokens > max_tokens:
+        print(f"  [Prediction Phase] Response too long ({completion_tokens} > {max_tokens}) — using defaults")
+        return _prediction_defaults(raw_response=raw)
+
+    success_match = re.search(
+        r"<success_prediction>\s*(True|False)\s*</success_prediction>",
+        raw, re.IGNORECASE
+    )
+    if not success_match:
+        print("  [Prediction Phase] Missing <success_prediction> tag — using defaults")
+        return _prediction_defaults(raw_response=raw)
+    success_prediction = success_match.group(1).strip().lower() == "true"
+
+    n_reliable_match = re.search(
+        r"<n_reliable_prediction>\s*(\d+)\s*</n_reliable_prediction>",
+        raw
+    )
+    if not n_reliable_match:
+        print("  [Prediction Phase] Missing <n_reliable_prediction> tag — using defaults")
+        return _prediction_defaults(raw_response=raw)
+    n_reliable_prediction = int(n_reliable_match.group(1))
+
+    compaction_match = re.search(
+        r"<compaction_prompt>(.*?)</compaction_prompt>",
+        raw, re.DOTALL
+    )
+    if not compaction_match:
+        print("  [Prediction Phase] Missing <compaction_prompt> tag — using defaults")
+        return _prediction_defaults(raw_response=raw)
+    compaction_prompt = compaction_match.group(1).strip() or DEFAULT_COMPACTION_PROMPT
+
+    print(f"  [Prediction Phase] success={success_prediction}, n_reliable={n_reliable_prediction}, compaction='{compaction_prompt[:60]}'")
+
+    return {
+        "success_prediction": success_prediction,
+        "n_reliable_prediction": n_reliable_prediction,
+        "compaction_prompt": compaction_prompt,
+        "raw_response": raw,
+    }
+
+
+def _prediction_defaults(raw_response: str = "") -> dict:
+    """Return safe prediction defaults (n_reliable=None means infinity)."""
+    return {
+        "success_prediction": True,
+        "n_reliable_prediction": None,
+        "compaction_prompt": DEFAULT_COMPACTION_PROMPT,
+        "raw_response": raw_response,
+    }
+
+
+# ─── Scoring Engine ──────────────────────────────────────────────────────────
+
+def calculate_scores(results_dir: Optional[Path] = None) -> None:
+    """
+    Load all per-model result files and compute Scott's composite benchmark scores.
+
+    Scott's formula:
+        Per-problem score:
+            problem_score = baseline_n_reliable / n_reliable
+            prediction_score = baseline_n_reliable_prediction / n_reliable_prediction
+
+        Where baseline = lowest n_reliable (or n_reliable_prediction) across all models
+        that solved that problem (i.e. the best-performing model sets the baseline).
+
+        Composite scores:
+            composite = mean(problem_scores over all solved problems)
+            prediction_composite = mean(prediction_scores over all problems)
+
+        Coverage = problems_attempted / problems_eligible
+            eligible = problems where model context_max >= baseline_n_reliable
+
+        Accuracy = problems_solved / problems_attempted
+
+        Prediction accuracy = correct_success_predictions / total_problems
+            (correct = predicted True and solved, OR predicted False and unsolvable)
+
+        Final score = composite * prediction_composite * coverage * accuracy
+                    * prediction_accuracy * 10000
+
+        NOTE: FLOPs not tracked yet — omitted from scoring, noted in output.
+
+    Prints both a per-problem table and a per-model composite score table.
+    """
+    if results_dir is None:
+        results_dir = RESULTS_DIR
+    results_dir = Path(results_dir)
+
+    # ── Load all per-config result files (not summary files) ──
+    # File naming: results/{model_name}_{problem_id}_{config}.json
+    # or legacy: results/{problem_id}_{config}.json (no model prefix)
+    result_files = sorted(results_dir.glob("*.json"))
+    result_files = [f for f in result_files if not f.name.endswith("_summary.json")]
+
+    if not result_files:
+        print("No result files found. Run experiments first.")
+        return
+
+    # Structure: {model_name: {problem_id: {config_name: result_dict}}}
+    by_model: dict[str, dict[str, dict[str, dict]]] = {}
+
+    for rf in result_files:
+        try:
+            data = json.loads(rf.read_text())
+        except Exception as e:
+            print(f"  [scores] Could not read {rf.name}: {e}")
+            continue
+
+        model_name = data.get("model_name") or data.get("model") or "unknown"
+        pid = data.get("problem_id", rf.stem)
+        config = data.get("config", {})
+        config_name = config.get("name", "unknown") if isinstance(config, dict) else str(config)
+
+        by_model.setdefault(model_name, {}).setdefault(pid, {})[config_name] = data
+
+    if not by_model:
+        print("No parseable result files found.")
+        return
+
+    all_problem_ids = sorted({pid for m in by_model.values() for pid in m})
+
+    # ── Load fixed baselines from problem JSON files (gptoss_120b_correct_token_avg) ──
+    # This is the GPT-4o 120B reference — fixed, not dynamic across models.
+    # Using a fixed baseline ensures scores are stable and comparable across runs.
+    baseline_n_reliable: dict[str, Optional[int]] = {}
+    baseline_n_pred: dict[str, Optional[int]] = {}
+
+    problems_dir = Path(__file__).parent / "problems"
+    for pid in all_problem_ids:
+        prob_file = problems_dir / f"{pid}.json"
+        if prob_file.exists():
+            try:
+                prob_data = json.loads(prob_file.read_text())
+                token_avg = prob_data.get("gptoss_120b_correct_token_avg")
+                if token_avg is not None:
+                    baseline_n_reliable[pid] = int(token_avg)
+                    baseline_n_pred[pid] = int(token_avg)
+                else:
+                    baseline_n_reliable[pid] = None
+                    baseline_n_pred[pid] = None
+            except Exception:
+                baseline_n_reliable[pid] = None
+                baseline_n_pred[pid] = None
+        else:
+            baseline_n_reliable[pid] = None
+            baseline_n_pred[pid] = None
+
+    # ── Per-problem detail table ──
+    print(f"\n{'='*110}")
+    print(f"  AmnesiaBench v2 — Per-Problem Detail")
+    print(f"{'='*110}")
+    print(f"{'Model':<25} {'Problem':<28} {'Config':<22} {'MinWin':>7} {'Baseline':>8} {'ProbScore':>10} {'N_Pred':>8} {'PredScore':>10}")
+    print(f"{'-'*110}")
+
+    # ── Per-model composite score computation ──
+    model_scores = {}
+
+    for model_name in sorted(by_model.keys()):
+        model_data = by_model[model_name]
+        problem_scores = []
+        prediction_scores = []
+        total_problems = len(all_problem_ids)
+        problems_attempted = 0
+        problems_solved = 0
+        problems_eligible = 0
+        correct_success_preds = 0
+
+        for pid in all_problem_ids:
+            if pid not in model_data:
+                continue
+
+            baseline = baseline_n_reliable.get(pid)
+            base_pred = baseline_n_pred.get(pid)
+
+            # Count as eligible if baseline exists (some model solved it)
+            if baseline is not None:
+                problems_eligible += 1
+
+            # Use the best config for this problem (lowest minimum_window)
+            configs_for_pid = model_data[pid]
+            best_result = None
+            best_mw = None
+            for config_name, result in configs_for_pid.items():
+                mw = result.get("minimum_window")
+                if mw is not None:
+                    if best_mw is None or mw < best_mw:
+                        best_mw = mw
+                        best_result = result
+
+            if best_result is None:
+                # Model didn't solve this problem in any config
+                pred = list(configs_for_pid.values())[0].get("prediction", {}) or {}
+                success_pred = pred.get("success_prediction", True)
+                if not success_pred and baseline is None:
+                    correct_success_preds += 1  # correctly predicted failure
+                # Still attempted
+                problems_attempted += 1
+                continue
+
+            problems_attempted += 1
+            problems_solved += 1
+
+            # Problem score
+            if baseline is not None and best_mw is not None:
+                prob_score = baseline / best_mw
+            else:
+                prob_score = 0.0
+            problem_scores.append(prob_score)
+
+            # Prediction score
+            pred = best_result.get("prediction", {}) or {}
+            n_pred_val = pred.get("n_reliable_prediction")
+            success_pred = pred.get("success_prediction", True)
+
+            if success_pred:
+                correct_success_preds += 1  # correctly predicted success (and solved)
+
+            if n_pred_val is not None and base_pred is not None and n_pred_val > 0:
+                pred_score = base_pred / n_pred_val
+            else:
+                pred_score = 0.0
+            prediction_scores.append(pred_score)
+
+            prob_score_str = f"{prob_score:.3f}"
+            pred_score_str = f"{pred_score:.3f}" if n_pred_val is not None else "N/A"
+            baseline_str = str(baseline) if baseline is not None else "N/A"
+            n_pred_str = str(n_pred_val) if n_pred_val is not None else "inf"
+
+            # Use config name from best result
+            cfg = best_result.get("config", {})
+            cfg_name = cfg.get("name", "unknown") if isinstance(cfg, dict) else str(cfg)
+
+            print(
+                f"{model_name:<25} {pid:<28} {cfg_name:<22} {str(best_mw):>7} "
+                f"{baseline_str:>8} {prob_score_str:>10} {n_pred_str:>8} {pred_score_str:>10}"
+            )
+
+        # ── Composite scores ──
+        composite = sum(problem_scores) / len(problem_scores) if problem_scores else 0.0
+        pred_composite = sum(prediction_scores) / len(prediction_scores) if prediction_scores else 0.0
+        # Coverage: fraction of eligible problems the model attempted (capped at 1.0)
+        coverage = min(1.0, problems_solved / problems_eligible) if problems_eligible > 0 else 0.0
+        accuracy = problems_solved / problems_attempted if problems_attempted > 0 else 0.0
+        pred_accuracy = correct_success_preds / total_problems if total_problems > 0 else 0.0
+
+        final_score = composite * pred_composite * coverage * accuracy * pred_accuracy * 10000
+
+        model_scores[model_name] = {
+            "composite": composite,
+            "pred_composite": pred_composite,
+            "coverage": coverage,
+            "accuracy": accuracy,
+            "pred_accuracy": pred_accuracy,
+            "final_score": final_score,
+            "problems_attempted": problems_attempted,
+            "problems_solved": problems_solved,
+            "problems_eligible": problems_eligible,
+            "total_problems": total_problems,
+        }
+
+    # ── Per-model composite table ──
+    print(f"\n{'='*100}")
+    print(f"  AmnesiaBench v2 — Composite Scores (Scott's Formula)")
+    print(f"  NOTE: FLOPs not tracked — omitted from scoring.")
+    print(f"{'='*100}")
+    print(f"{'Model':<25} {'Composite':>10} {'PredComp':>10} {'Coverage':>9} {'Accuracy':>9} {'PredAcc':>8} {'FinalScore':>12}")
+    print(f"{'-'*100}")
+
+    for model_name in sorted(model_scores.keys()):
+        s = model_scores[model_name]
+        print(
+            f"{model_name:<25} "
+            f"{s['composite']:>10.4f} "
+            f"{s['pred_composite']:>10.4f} "
+            f"{s['coverage']:>9.3f} "
+            f"{s['accuracy']:>9.3f} "
+            f"{s['pred_accuracy']:>8.3f} "
+            f"{s['final_score']:>12.2f}"
+        )
+    print(f"{'='*100}")
+    print(f"\nFormula: final_score = composite × pred_composite × coverage × accuracy × pred_accuracy × 10000")
+    print(f"  composite = mean(baseline_n_reliable / model_n_reliable) over solved problems")
+    print(f"  pred_composite = mean(baseline_n_pred / model_n_pred) over all problems")
+    print(f"  coverage = attempted / eligible (eligible: baseline exists for problem)")
+    print(f"  accuracy = solved / attempted")
+    print(f"  pred_accuracy = correct_success_predictions / total_problems\n")
+
+
+# ─── Single Trial ─────────────────────────────────────────────────────────────
 
 @dataclass
 class Turn:
-    """One turn in the conversation."""
-    role: str                   # "system", "user", "assistant"
+    role: str
     content: str
-    tokens: Optional[int] = None          # completion_tokens (assistant only)
-    prompt_tokens: Optional[int] = None   # prompt_tokens at this point
-    total_tokens: Optional[int] = None    # total context at this point
+    tokens: Optional[int] = None
+    prompt_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
     finish_reason: Optional[str] = None
-    code_executed: Optional[str] = None   # code that was run (if any)
-    code_output: Optional[str] = None     # output from code (if any)
-    compact_summary: Optional[str] = None # summary extracted (if compact call)
+    code_executed: Optional[str] = None
+    code_output: Optional[str] = None
+    compact_summary: Optional[str] = None
 
 
 @dataclass
 class TrialResult:
-    """Full result of one trial."""
     problem_id: str
     correct_answer: int
     token_limit: int
@@ -266,19 +862,19 @@ class TrialResult:
     trial_idx: int
     success: bool
     answer: Optional[int]
-    total_tokens_peak: int      # peak total_tokens seen
+    total_tokens_peak: int
     n_turns: int
     n_compactions: int
     n_code_calls: int
     n_code_errors: int
     wall_time_s: float
     error: Optional[str]
-    finish_reason: str          # "solved", "truncated", "budget_exceeded", "max_turns", "error"
-    conversation: list = field(default_factory=list)  # list of Turn dicts
+    finish_reason: str
+    conversation: list = field(default_factory=list)
 
 
 def run_trial(
-    client: LLMClient,
+    client,
     problem_id: str,
     problem_text: str,
     correct_answer: int,
@@ -286,22 +882,21 @@ def run_trial(
     tir: bool,
     compaction: bool,
     trial_idx: int,
+    compaction_hint: str = "",
 ) -> TrialResult:
-    """Run one trial: try to solve the problem within the token budget."""
-
     t0 = time.time()
     sandbox = PythonSandbox() if tir else None
-    conversation: list[Turn] = []  # full trace
-    messages: list[dict] = []      # current API messages
+    conversation: list[Turn] = []
+    messages: list[dict] = []
     n_compactions = 0
     n_code_calls = 0
     n_code_errors = 0
     peak_tokens = 0
-    last_content = ""
     error_msg = None
     finish = "max_turns"
 
-    # Select system prompt
+    active_compaction_hint = compaction_hint.strip() if compaction_hint else DEFAULT_COMPACTION_PROMPT
+
     if compaction:
         sys_prompt = SYSTEM_COMPACT.format(
             token_limit=token_limit, max_compactions=MAX_COMPACTIONS
@@ -317,34 +912,29 @@ def run_trial(
     conversation.append(Turn(role="user", content=problem_text))
 
     for turn_i in range(MAX_TURNS):
-        # Calculate remaining budget
-        # We estimate prompt_tokens from the last known total.
-        # On the first call, we don't know yet — use a generous max_tokens.
         if peak_tokens > 0:
-            estimated_prompt = peak_tokens  # last total ≈ next prompt
-            remaining = token_limit - estimated_prompt
+            remaining = token_limit - peak_tokens
         else:
-            remaining = token_limit  # first call, let API figure it out
+            remaining = token_limit
 
         if remaining <= 0:
-            if compaction:
-                finish = "budget_exceeded"
-            else:
-                finish = "truncated"
+            finish = "budget_exceeded" if compaction else "truncated"
             break
 
-        # Generate
+        capped_tokens = min(remaining, MAX_COMPLETION_TOKENS)
         try:
-            resp = client.generate(messages, max_tokens=remaining)
+            resp = client.generate(messages, max_tokens=capped_tokens)
         except Exception as e:
             error_msg = f"API error: {e}"
             finish = "error"
             break
 
+        if resp["finish_reason"] in ("length", "truncated") and extract_boxed_answer(resp["content"]) is None:
+            finish = "truncated"
+
         content = resp["content"]
         total_now = resp["total_tokens"]
         peak_tokens = max(peak_tokens, total_now)
-        last_content = content
 
         turn = Turn(
             role="assistant",
@@ -354,54 +944,39 @@ def run_trial(
             total_tokens=total_now,
             finish_reason=resp["finish_reason"],
         )
-
         conversation.append(turn)
 
-        # ── Check for boxed answer FIRST (highest priority) ──
         answer = extract_boxed_answer(content)
         if answer is not None:
             finish = "solved"
             break
 
-        # ── Check for compact call ──
         compact_summary = extract_compact_call(content) if compaction else None
         if compact_summary is not None:
             turn.compact_summary = compact_summary
             n_compactions += 1
-
             if n_compactions > MAX_COMPACTIONS:
                 finish = "max_compactions"
                 break
-
-            # Reset conversation with summary
+            hint_line = f"\nHint: {active_compaction_hint}" if active_compaction_hint else ""
             restart_user_msg = POST_COMPACT_USER.format(
-                problem_text=problem_text,
+                problem_text=problem_text + hint_line,
                 summary=compact_summary,
             )
             messages = [
                 {"role": "system", "content": sys_prompt},
                 {"role": "user", "content": restart_user_msg},
             ]
-            # Reset peak tracking for new window
             peak_tokens = 0
-            conversation.append(Turn(
-                role="user",
-                content=f"[COMPACTION #{n_compactions} — context reset]",
-            ))
+            conversation.append(Turn(role="user", content=f"[COMPACTION #{n_compactions} — context reset]"))
             continue
 
-        # ── Check budget exceeded (compaction mode = hard fail) ──
         if total_now >= token_limit:
-            if compaction:
-                finish = "budget_exceeded"
-            else:
-                finish = "truncated"
+            finish = "budget_exceeded" if compaction else "truncated"
             break
 
-        # ── Check for python code blocks (TIR mode) ──
         code_blocks = extract_python_blocks(content) if tir else []
         if code_blocks:
-            # Execute ALL code blocks in order (variables persist)
             all_outputs = []
             for code in code_blocks:
                 n_code_calls += 1
@@ -410,11 +985,8 @@ def run_trial(
                     n_code_errors += 1
                 all_outputs.append(output)
             combined_output = "\n---\n".join(all_outputs)
-
-            # Truncate long output
             if len(combined_output) > 2000:
                 combined_output = combined_output[:2000] + "\n... (truncated)"
-
             code_turn = Turn(
                 role="user",
                 content=f"Code output:\n{combined_output}",
@@ -426,13 +998,11 @@ def run_trial(
             messages.append({"role": "user", "content": f"Code output:\n{combined_output}"})
             continue
 
-        # ── No code, no answer — prompt to continue ──
         messages.append({"role": "assistant", "content": content})
         messages.append({"role": "user", "content": "Continue solving."})
         conversation.append(Turn(role="user", content="Continue solving."))
 
-    # ── Extract answer ──
-    # Try to find answer from the full conversation
+    # Extract final answer from conversation
     answer = None
     for t in reversed(conversation):
         if t.role == "assistant":
@@ -468,9 +1038,8 @@ def run_trial(
 
 @dataclass
 class WindowTest:
-    """Result of testing one window size."""
     window: int
-    trials: list  # list of TrialResult dicts
+    trials: list
     n_success: int
     n_trials: int
     pass_rate: float
@@ -478,7 +1047,7 @@ class WindowTest:
 
 
 def binary_search(
-    client: LLMClient,
+    client,
     problem_id: str,
     problem_text: str,
     correct_answer: int,
@@ -487,11 +1056,8 @@ def binary_search(
     min_window: int = MIN_WINDOW,
     max_window: int = MAX_WINDOW,
     trials: int = TRIALS_PER_WINDOW,
+    compaction_hint: str = "",
 ) -> dict:
-    """
-    Binary search (log scale) for minimum context window.
-    Returns full results dict with all trials.
-    """
     config_name = f"{'Compact' if compaction else 'HardCut'}"
     print(f"\n{'='*60}")
     print(f"  {problem_id} | {config_name}")
@@ -500,11 +1066,11 @@ def binary_search(
 
     search_log: list[WindowTest] = []
 
-    # First: verify solvable at max window
+    # Verify solvable at max window
     print(f"\n  [Verify] Testing max window = {max_window} ...")
     test = _test_window(
         client, problem_id, problem_text, correct_answer,
-        max_window, tir, compaction, trials
+        max_window, tir, compaction, trials, compaction_hint
     )
     search_log.append(test)
     print(f"  [Verify] {test.n_success}/{test.n_trials} passed ({test.pass_rate:.0%})")
@@ -517,23 +1083,19 @@ def binary_search(
             search_range_final=(min_window, max_window),
         )
 
-    # Binary search
     lo, hi = min_window, max_window
     step = 0
-    while hi / lo > CONVERGENCE_RATIO:
+    while hi / lo > CONVERGENCE_RATIO and (hi - lo) > CONVERGENCE_ABS:
         step += 1
-        mid = int(math.exp((math.log(lo) + math.log(hi)) / 2))
-        # Snap to multiples of 64 for cleanliness
-        mid = max(min_window, (mid // 64) * 64)
-
-        # Avoid re-testing same values
+        mid = (lo + hi) // 2
+        mid = max(min_window, max(1, (mid // 16) * 16))
         if mid == lo or mid == hi:
             break
 
-        print(f"\n  [Step {step}] Testing window = {mid}  (range [{lo}, {hi}], ratio {hi/lo:.3f})")
+        print(f"\n  [Step {step}] Testing window = {mid}  (range [{lo}, {hi}], gap {hi-lo}, ratio {hi/lo:.3f})")
         test = _test_window(
             client, problem_id, problem_text, correct_answer,
-            mid, tir, compaction, trials
+            mid, tir, compaction, trials, compaction_hint
         )
         search_log.append(test)
         print(f"  [Step {step}] {test.n_success}/{test.n_trials} passed ({test.pass_rate:.0%}) → {'hi=mid' if test.passed else 'lo=mid'}")
@@ -544,7 +1106,6 @@ def binary_search(
             lo = mid
 
     print(f"\n  RESULT: minimum window ≈ {hi} tokens (range [{lo}, {hi}])")
-
     return _build_result(
         problem_id, tir, compaction, search_log,
         minimum_window=hi,
@@ -555,20 +1116,17 @@ def binary_search(
 def _test_window(
     client, problem_id, problem_text, correct_answer,
     window, tir, compaction, n_trials,
+    compaction_hint: str = "",
 ) -> WindowTest:
-    """Run N trials at a given window size, in parallel."""
     t0 = time.time()
 
     def _run_one(i):
         return run_trial(
             client, problem_id, problem_text, correct_answer,
-            token_limit=window,
-            tir=tir,
-            compaction=compaction,
-            trial_idx=i,
+            token_limit=window, tir=tir, compaction=compaction,
+            trial_idx=i, compaction_hint=compaction_hint,
         )
 
-    # Run all trials in parallel (server has enough slots)
     trials_results = [None] * n_trials
     n_success = 0
     with ThreadPoolExecutor(max_workers=n_trials) as pool:
@@ -587,12 +1145,9 @@ def _run_one(i):
     pass_rate = n_success / n_trials
     print(f"    [{n_trials} trials in {elapsed:.1f}s wall, {n_success}/{n_trials} passed]")
     return WindowTest(
-        window=window,
-        trials=trials_results,
-        n_success=n_success,
-        n_trials=n_trials,
-        pass_rate=pass_rate,
-        passed=pass_rate >= SUCCESS_THRESHOLD,
+        window=window, trials=trials_results,
+        n_success=n_success, n_trials=n_trials,
+        pass_rate=pass_rate, passed=pass_rate >= SUCCESS_THRESHOLD,
     )
 
 
@@ -611,15 +1166,13 @@ def _build_result(problem_id, tir, compaction, search_log, minimum_window, searc
     }
 
 
-# ─── Main ────────────────────────────────────────────────────────────────────
+# ─── Problem Loading ─────────────────────────────────────────────────────────
 
 def load_problem(problem_id: str) -> dict:
-    """Load a problem JSON from the problems/ directory."""
-    # Try exact match
+    """Load a problem JSON from problems/. Matches exact stem or substring."""
     path = PROBLEMS_DIR / f"{problem_id}.json"
     if path.exists():
         return json.loads(path.read_text())
-    # Try fuzzy match (e.g., "ab507a9f" matches "aimo3_hard_ab507a9f.json")
     for p in PROBLEMS_DIR.glob("*.json"):
         if problem_id in p.stem:
             return json.loads(p.read_text())
@@ -630,19 +1183,42 @@ def load_all_problems() -> list[dict]:
     return [json.loads(p.read_text()) for p in sorted(PROBLEMS_DIR.glob("*.json"))]
 
 
+# ─── Result Filename Helpers ─────────────────────────────────────────────────
+
+def result_filename(model_name: str, problem_id: str, config_name: str) -> Path:
+    """
+    Build result file path for a given model/problem/config combination.
+    Format: results/{model_name}_{problem_id}_{config_name}.json
+    Model name is sanitized (spaces → underscores, slashes → dashes).
+    """
+    safe_model = re.sub(r"[^\w\-]", "_", model_name)
+    return RESULTS_DIR / f"{safe_model}_{problem_id}_{config_name}.json"
+
+
+def summary_filename(model_name: str, problem_id: str) -> Path:
+    safe_model = re.sub(r"[^\w\-]", "_", model_name)
+    return RESULTS_DIR / f"{safe_model}_{problem_id}_summary.json"
+
+
+# ─── Single-Problem Runner ───────────────────────────────────────────────────
+
 def run_problem(
-    client: LLMClient,
+    client,
     problem: dict,
+    model_name: str = "unknown",
     configs: list[tuple[bool, bool]] = None,
     min_window: int = MIN_WINDOW,
     max_window: int = MAX_WINDOW,
     trials: int = TRIALS_PER_WINDOW,
 ):
-    """Run binary search for all configs on one problem. Save results."""
+    """
+    Run binary search for all configs on one problem. Save per-config and summary results.
+    Results namespaced by model_name to prevent multi-model collisions.
+    """
     if configs is None:
         configs = [
-            (False, False),  # Hard Cutoff (no tools)
-            (False, True),   # Compaction (compact tool only)
+            (False, False),  # NoTIR + HardCut
+            (False, True),   # NoTIR + Compact
         ]
 
     pid = problem["problem_id"]
@@ -650,7 +1226,39 @@ def run_problem(
 
     all_results = []
     for tir, compaction in configs:
-        config_name = f"{'Compact' if compaction else 'HardCut'}"
+        config_name = f"{'TIR' if tir else 'NoTIR'}_{'Compact' if compaction else 'HardCut'}"
+        outpath = result_filename(model_name, pid, config_name)
+
+        # Resume: skip if valid completed result exists
+        if outpath.exists():
+            try:
+                existing = json.loads(outpath.read_text())
+                if existing.get("minimum_window") is not None or existing.get("binary_search"):
+                    print(f"\n  [SKIP] {model_name} | {pid} | {config_name} — result exists at {outpath.name}")
+                    all_results.append(existing)
+                    continue
+            except Exception:
+                pass
+
+        # Prediction phase
+        prediction = run_prediction_phase(client, problem, max_tokens=300)
+        compaction_hint = prediction.get("compaction_prompt", DEFAULT_COMPACTION_PROMPT)
+
+        if not prediction.get("success_prediction", True):
+            print(f"\n  [Prediction Phase] Model opted out. Skipping binary search for {pid} | {config_name}.")
+            result = _build_result(
+                pid, tir, compaction, [],
+                minimum_window=None,
+                search_range_final=(min_window, max_window),
+            )
+            result["prediction"] = prediction
+            result["model_name"] = model_name
+            all_results.append(result)
+            outpath.write_text(json.dumps(result, indent=2, default=str))
+            print(f"\n  Saved (opt-out): {outpath.name}")
+            continue
+
+        # Binary search
         result = binary_search(
             client,
             problem_id=pid,
@@ -661,87 +1269,237 @@ def run_problem(
             min_window=min_window,
             max_window=max_window,
             trials=trials,
+            compaction_hint=compaction_hint,
         )
-        result["model"] = "Qwen3.5-35B-A3B-Q4_K_M"
+        result["model_name"] = model_name
+        result["prediction"] = prediction
         all_results.append(result)
 
-        # Save per-config result (with full traces)
-        outpath = RESULTS_DIR / f"{pid}_{config_name}.json"
         outpath.write_text(json.dumps(result, indent=2, default=str))
-        print(f"\n  Saved: {outpath}")
+        print(f"\n  Saved: {outpath.name}")
 
-    # Save combined summary (without conversation traces for readability)
+    # Save combined summary (compact, no conversation traces)
     summary = []
     for r in all_results:
-        summary.append({
+        entry = {
+            "model_name": model_name,
             "problem_id": r["problem_id"],
-            "config": r["config"]["name"],
+            "config": r["config"]["name"] if isinstance(r.get("config"), dict) else r.get("config"),
             "minimum_window": r["minimum_window"],
-            "search_range_final": r["search_range_final"],
-            "steps": len(r["binary_search"]),
-        })
+            "search_range_final": r.get("search_range_final"),
+            "steps": len(r.get("binary_search", [])),
+        }
+        pred = r.get("prediction")
+        if pred:
+            entry["n_reliable_prediction"] = pred.get("n_reliable_prediction")
+            entry["success_prediction"] = pred.get("success_prediction")
+        summary.append(entry)
 
-    summary_path = RESULTS_DIR / f"{pid}_summary.json"
-    summary_path.write_text(json.dumps(summary, indent=2))
-    print(f"\n  Summary: {summary_path}")
+    sp = summary_filename(model_name, pid)
+    sp.write_text(json.dumps(summary, indent=2))
+    print(f"\n  Summary: {sp.name}")
 
     return all_results
 
 
+# ─── Multi-Model Runner ──────────────────────────────────────────────────────
+
+def load_models_json() -> list[dict]:
+    """Load models.json from the AmnesiaBench directory. Returns list of {name, url} dicts."""
+    if not MODELS_JSON.exists():
+        raise FileNotFoundError(
+            f"models.json not found at {MODELS_JSON}. "
+            "Create it with a list of {{name, url}} entries."
+        )
+    models = json.loads(MODELS_JSON.read_text())
+    if not isinstance(models, list) or not models:
+        raise ValueError("models.json must be a non-empty list of {name, url} objects.")
+    for m in models:
+        if "name" not in m or "url" not in m:
+            raise ValueError(f"Each model entry must have 'name' and 'url' keys. Got: {m}")
+    return models
+
+
+def run_all_models(
+    problems: list[dict],
+    configs: list[tuple[bool, bool]] = None,
+    min_window: int = MIN_WINDOW,
+    max_window: int = MAX_WINDOW,
+    trials: int = TRIALS_PER_WINDOW,
+    temperature: float = TEMPERATURE,
+    cli_api_key: str = None,
+):
+    """
+    Iterate over all models in models.json, run all problems for each model.
+    Models are run sequentially (one model at a time, all problems per model).
+    If a model's server is unreachable, it is skipped with a warning.
+    Supports api_key_env field in models.json for Gemini-style API key lookup.
+    """
+    models = load_models_json()
+    print(f"\n{'#'*70}")
+    print(f"  --run-all-models: {len(models)} model(s) × {len(problems)} problem(s)")
+    for m in models:
+        print(f"    {m['name']} → {m['url']}")
+    print(f"{'#'*70}\n")
+
+    for model_entry in models:
+        mname = model_entry["name"]
+        murl = model_entry["url"]
+        print(f"\n{'#'*70}")
+        print(f"  MODEL: {mname}")
+        print(f"  URL:   {murl}")
+        print(f"{'#'*70}")
+
+        # Resolve API key: cli flag > api_key_env field > env var GEMINI_API_KEY
+        api_key = cli_api_key
+        api_key_env = model_entry.get("api_key_env")
+        if not api_key and api_key_env:
+            api_key = os.environ.get(api_key_env)
+            if api_key:
+                print(f"  API key resolved from env var: {api_key_env}")
+            else:
+                print(f"  WARNING: api_key_env='{api_key_env}' not found in environment")
+
+        try:
+            client = create_client(
+                server_url=murl,
+                api_key=api_key,
+                model_name=mname,
+                temperature=temperature,
+            )
+        except ValueError as e:
+            print(f"  ERROR: Could not create client for {mname}: {e} — skipping")
+            continue
+
+        if not client.ping():
+            print(f"  WARNING: Cannot reach server at {murl} — skipping {mname}")
+            continue
+
+        print(f"  Server OK: {murl}")
+        for problem in problems:
+            print(f"\n{'='*60}")
+            print(f"  PROBLEM: {problem['problem_id']}")
+            print(f"  Answer: {problem['ground_truth']}")
+            print(f"{'='*60}")
+            run_problem(
+                client, problem,
+                model_name=mname,
+                configs=configs,
+                min_window=min_window,
+                max_window=max_window,
+                trials=trials,
+            )
+
+    print("\n\nAll models done. Run --scores for composite scoring table.")
+
+
+# ─── Analysis ────────────────────────────────────────────────────────────────
+
 def analyze_results():
-    """Print a summary table of all completed results."""
-    results_files = sorted(RESULTS_DIR.glob("*_summary.json"))
-    if not results_files:
+    """Print a per-model summary table of all completed results."""
+    summary_files = sorted(RESULTS_DIR.glob("*_summary.json"))
+    if not summary_files:
         print("No results found. Run experiments first.")
         return
 
-    print(f"\n{'Problem':<30} {'Config':<20} {'Min Window':>10} {'Range':>16}")
-    print("-" * 80)
-    for f in results_files:
-        data = json.loads(f.read_text())
+    print(f"\n{'Model':<25} {'Problem':<30} {'Config':<24} {'Min Window':>10} {'Range':>18} {'Steps':>6}")
+    print("-" * 118)
+
+    current_model = None
+    for f in summary_files:
+        try:
+            data = json.loads(f.read_text())
+        except Exception as e:
+            print(f"  [analyze] Could not read {f.name}: {e}")
+            continue
+
         for entry in data:
-            lo, hi = entry["search_range_final"]
-            mw = entry["minimum_window"]
+            model = entry.get("model_name", "unknown")
+            pid = entry.get("problem_id", "?")
+            config = entry.get("config", "?")
+            mw = entry.get("minimum_window")
             mw_str = str(mw) if mw is not None else "UNSOLVABLE"
-            print(f"{entry['problem_id']:<30} {entry['config']:<20} {mw_str:>10} [{lo:>6}, {hi:>6}]")
+            sr = entry.get("search_range_final", ["-", "-"])
+            lo, hi = sr if sr else ("-", "-")
+            steps = entry.get("steps", "?")
+
+            if model != current_model:
+                if current_model is not None:
+                    print()
+                current_model = model
+
+            print(f"{model:<25} {pid:<30} {config:<24} {mw_str:>10} [{str(lo):>6}, {str(hi):>6}] {str(steps):>6}")
+
+
+# ─── Main ────────────────────────────────────────────────────────────────────
+
+def derive_model_name(url: str) -> str:
+    """Derive a short model name from the server URL."""
+    url = url.rstrip("/")
+    # For gemini:// URLs extract the model name directly
+    if url.startswith("gemini://") or url.startswith("google://"):
+        scheme = "gemini://" if url.startswith("gemini://") else "google://"
+        return url[len(scheme):].strip("/") or "gemini"
+    # Extract host:port, replace dots/colons with underscores
+    host_port = url.split("//")[-1]
+    return re.sub(r"[^\w]", "_", host_port)
 
 
 def main():
-    parser = argparse.ArgumentParser(description="AmnesiaBench — context window binary search")
+    parser = argparse.ArgumentParser(description="AmnesiaBench v2 — multi-model context window benchmark")
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument("--problem", type=str, help="Problem ID (or substring) to test")
     group.add_argument("--all", action="store_true", help="Run all problems")
     group.add_argument("--analyze", action="store_true", help="Analyze existing results")
+    group.add_argument("--scores", action="store_true", help="Print composite Scott scoring table")
+
+    parser.add_argument("--model", type=str, default=SERVER_URL,
+                        help=f"Server URL (default: {SERVER_URL}). Use gemini://MODEL for Gemini.")
+    parser.add_argument("--model-name", type=str, default=None,
+                        help="Label for this model in results (default: derived from --model URL)")
+    parser.add_argument("--api-key", type=str, default=None,
+                        help="API key for Gemini backends (overrides GEMINI_API_KEY env var)")
+    parser.add_argument("--run-all-models", action="store_true",
+                        help="Iterate over all models in models.json (overrides --model/--model-name)")
 
-    parser.add_argument("--server", type=str, default=SERVER_URL, help="llama.cpp server URL")
     parser.add_argument("--min-window", type=int, default=MIN_WINDOW)
     parser.add_argument("--max-window", type=int, default=MAX_WINDOW)
     parser.add_argument("--trials", type=int, default=TRIALS_PER_WINDOW)
     parser.add_argument("--temperature", type=float, default=TEMPERATURE)
     parser.add_argument("--config", type=str, default=None,
                         help="Run specific config only: NoTIR_HardCut, TIR_HardCut, NoTIR_Compact, TIR_Compact")
+    parser.add_argument("--results-dir", type=str, default=None,
+                        help="Results directory for --scores / --analyze (default: ./results)")
 
     args = parser.parse_args()
 
+    # Redirect results dir if specified
+    if args.results_dir:
+        global RESULTS_DIR
+        RESULTS_DIR = Path(args.results_dir)
+
     if args.analyze:
         analyze_results()
         return
 
+    if args.scores:
+        rd = Path(args.results_dir) if args.results_dir else None
+        calculate_scores(rd)
+        return
+
     min_window = args.min_window
     max_window = args.max_window
     trials_per_window = args.trials
 
-    client = LLMClient(server_url=args.server, temperature=args.temperature)
-    if not client.ping():
-        print(f"ERROR: Cannot reach llama.cpp server at {args.server}")
-        print(f"Start it first:\n  llama-server --model <path> --host 0.0.0.0 --port 8080 --ctx-size 65536")
-        sys.exit(1)
-    print(f"Server OK: {args.server}")
-
-    # Parse config filter
+    # Config filter
     configs = None
     if args.config:
         config_map = {
+            "NoTIR_HardCut": (False, False),
+            "TIR_HardCut": (True, False),
+            "NoTIR_Compact": (False, True),
+            "TIR_Compact": (True, True),
+            # Legacy short names
             "HardCut": (False, False),
             "Compact": (False, True),
         }
@@ -750,15 +1508,62 @@ def main():
             sys.exit(1)
         configs = [config_map[args.config]]
 
+    # Load problems
     if args.all:
         problems = load_all_problems()
     else:
         problems = [load_problem(args.problem)]
 
+    # Resolve API key: --api-key > scheme-specific env var > GEMINI_API_KEY fallback
+    _model_url = args.model
+    if args.api_key:
+        api_key = args.api_key
+    elif _model_url.startswith("openrouter://"):
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+    else:
+        api_key = os.environ.get("GEMINI_API_KEY")
+
+    # Multi-model mode
+    if args.run_all_models:
+        run_all_models(
+            problems=problems,
+            configs=configs,
+            min_window=min_window,
+            max_window=max_window,
+            trials=trials_per_window,
+            temperature=args.temperature,
+            cli_api_key=api_key,
+        )
+        return
+
+    # Single-model mode
+    model_url = args.model
+    model_name = args.model_name or derive_model_name(model_url)
+
+    try:
+        client = create_client(
+            server_url=model_url,
+            api_key=api_key,
+            model_name=model_name,
+            temperature=args.temperature,
+        )
+    except ValueError as e:
+        print(f"ERROR: {e}")
+        sys.exit(1)
+
+    if not client.ping():
+        print(f"ERROR: Cannot reach server at {model_url}")
+        if model_url.startswith("http"):
+            print(f"Start it first:\n  llama-server --model <path> --host 0.0.0.0 --port 8080 --ctx-size 65536")
+        else:
+            print(f"Check your API key and model name.")
+        sys.exit(1)
+    print(f"Server OK: {model_url}  (model_name: {model_name})")
+
     print(f"Problems: {[p['problem_id'] for p in problems]}")
     print(f"Search range: [{min_window}, {max_window}]")
     print(f"Trials per window: {trials_per_window}")
-    print(f"Configs: {[c for c in (configs or [(False,False),(False,True)])]}")
+    print(f"Configs: {configs or [(False,False),(False,True)]}")
     print()
 
     for problem in problems:
@@ -767,11 +1572,16 @@ def main():
         print(f"  Answer: {problem['ground_truth']}")
         print(f"  120B pass rate: {problem.get('gptoss_120b_pass_rate', '?')}")
         print(f"{'#'*60}")
-        run_problem(client, problem, configs=configs,
-                    min_window=min_window, max_window=max_window,
-                    trials=trials_per_window)
+        run_problem(
+            client, problem,
+            model_name=model_name,
+            configs=configs,
+            min_window=min_window,
+            max_window=max_window,
+            trials=trials_per_window,
+        )
 
-    print("\n\nAll done. Run --analyze to see summary.")
+    print("\n\nAll done. Run --analyze to see summary or --scores for composite score table.")
 
 
 if __name__ == "__main__":
diff --git a/models.json b/models.json
new file mode 100644
index 0000000..3d9105d
--- /dev/null
+++ b/models.json
@@ -0,0 +1,32 @@
+[
+  {
+    "name": "Qwen3.5-35B-A3B-Q4",
+    "url": "http://localhost:8080",
+    "api_key_env": null
+  },
+  {
+    "name": "gemini-3.1-flash-lite-preview",
+    "url": "gemini://gemini-3.1-flash-lite-preview",
+    "api_key_env": "GEMINI_API_KEY"
+  },
+  {
+    "name": "openai/gpt-oss-120b",
+    "url": "openrouter://openai/gpt-oss-120b:free",
+    "api_key_env": "OPENROUTER_API_KEY"
+  },
+  {
+    "name": "openai/gpt-oss-20b",
+    "url": "openrouter://openai/gpt-oss-20b:free",
+    "api_key_env": "OPENROUTER_API_KEY"
+  },
+  {
+    "name": "nvidia/nemotron-3-super-120b",
+    "url": "openrouter://nvidia/nemotron-3-super-120b-a12b:free",
+    "api_key_env": "OPENROUTER_API_KEY"
+  },
+  {
+    "name": "qwen/qwen3-next-80b",
+    "url": "openrouter://qwen/qwen3-next-80b-a3b-instruct:free",
+    "api_key_env": "OPENROUTER_API_KEY"
+  }
+]
diff --git a/problems/aimo3_hard_00eaa992.json b/problems/aimo3_hard_00eaa992.json
new file mode 100644
index 0000000..6e1cbeb
--- /dev/null
+++ b/problems/aimo3_hard_00eaa992.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_00eaa992",
+  "problem_text": "If \\((60-a)(60-b)(60-c)(60-d)(60-e) = 1025\\), what is the value of \\(a + b + c + d\\)?\n\nNote: 1025 is divisible by 5.",
+  "ground_truth": "188",
+  "topic": "number_theory",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_2e0b7ba3.json b/problems/aimo3_hard_2e0b7ba3.json
new file mode 100644
index 0000000..faa1763
--- /dev/null
+++ b/problems/aimo3_hard_2e0b7ba3.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_2e0b7ba3",
+  "problem_text": "How many ways are there to paint the first level of the Th\u00e1p R\u00f9a tower model, given that the $3$ doorways at the front are painted with the same color and each of the remaining $7$ doorways is painted with one of the three colors such that any two adjacent doorways with a common side on the same level are painted with different colors?",
+  "ground_truth": "216",
+  "topic": "combinatorics",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_5f9595ae.json b/problems/aimo3_hard_5f9595ae.json
new file mode 100644
index 0000000..973d181
--- /dev/null
+++ b/problems/aimo3_hard_5f9595ae.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_5f9595ae",
+  "problem_text": "In an acute triangle \\(ABC\\) with \\(\\angle A = 30^\\circ\\), a circle with diameter \\(BC\\) intersects \\(AB\\) and \\(AC\\) at points \\(D\\) and \\(E\\), respectively. Find the ratio of the area of \\(\\triangle ADC\\) to the area of quadrilateral \\(DBCE\\).",
+  "ground_truth": "3",
+  "topic": "geometry",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_858cba58.json b/problems/aimo3_hard_858cba58.json
new file mode 100644
index 0000000..9c7bd09
--- /dev/null
+++ b/problems/aimo3_hard_858cba58.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_858cba58",
+  "problem_text": "All vertices of a pyramid lie on the facets of a cube but not on its edges, and each facet contains at least one vertex. What is the maximum possible number of the vertices of the pyramid?",
+  "ground_truth": "13",
+  "topic": "other",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_b0dc264b.json b/problems/aimo3_hard_b0dc264b.json
new file mode 100644
index 0000000..22a7c54
--- /dev/null
+++ b/problems/aimo3_hard_b0dc264b.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_b0dc264b",
+  "problem_text": "Given the group $(G, *)$ with $G = \\{a, b, c, d, f, g, h, k\\}$ and identity $k$, and the following operations:\n- $a * b = c$\n- $b * a = d$\n- $f * f = a$\n- $g * g = b$\n- $h * h = c$\n\nHow many self-inverses does $(G, *)$ have?",
+  "ground_truth": "2",
+  "topic": "other",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_b1da52fa.json b/problems/aimo3_hard_b1da52fa.json
new file mode 100644
index 0000000..2f7683a
--- /dev/null
+++ b/problems/aimo3_hard_b1da52fa.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_b1da52fa",
+  "problem_text": "Drunk and Horse play a game on a $2023! \\times 2023!$ grid. Horse chooses a positive integer $k < 2023$ and lights up each square using $k$ different colors. Horse starts on a random square, and Drunk starts on another square exactly $2023^2$ squares away in one direction. In each turn, Drunk moves one square in some direction but cannot move in the same direction for 2023 consecutive moves. Horse moves up to $m$ squares in a direction of its choice, where $m = 2023 - k$. Horse gets alerted of the color of the square Drunk was previously on. Horse wins if it ends up on a square that shares at least one vertex with Drunk's square. What is the minimum number of turns in which Horse can guarantee a win?",
+  "ground_truth": "2030",
+  "topic": "combinatorics",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file
diff --git a/problems/aimo3_hard_f728b4b1.json b/problems/aimo3_hard_f728b4b1.json
new file mode 100644
index 0000000..25e7e94
--- /dev/null
+++ b/problems/aimo3_hard_f728b4b1.json
@@ -0,0 +1,10 @@
+{
+  "problem_id": "aimo3_hard_f728b4b1",
+  "problem_text": "Triangle $ABC$ is constructed such that $AB>BC>AC$. Points $E$, $F$, and $G$ are drawn from $A$, $B$, and $C$ to their opposite sides respectively. If $\\frac{BE}{EC} = \\frac{7}{12}$, the ratio of $\\frac{AF}{FC}$ can be represented as $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. What is the smallest possible sum of $m+n$?",
+  "ground_truth": "26",
+  "topic": "geometry",
+  "source": "aimo3_hard",
+  "gptoss_20b_pass_rate": 0.125,
+  "gptoss_20b_n_correct": 2,
+  "gptoss_20b_correct_token_avg": null
+}
\ No newline at end of file

From 3ff7db74585daf266bf15a30c551eee8d0ec7108 Mon Sep 17 00:00:00 2001
From: "Bubba (AmnesiaBench bot)" <bubba@voynich.ai>
Date: Sun, 29 Mar 2026 02:09:20 -0400
Subject: [PATCH 2/2] fix: OpenRouter rate limiting and prompt caching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Increase backoff retries 5→20, max_delay 60s→120s
- Respect Retry-After / x-ratelimit-reset-requests headers
- Add X-OpenRouter-Cache: true header for prompt caching on repeated system prompts
- Drop parallelism to max 3 per model with staggered launches to avoid 429 bursts
---
 amnesia_bench.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/amnesia_bench.py b/amnesia_bench.py
index 854be4c..c7057d5 100644
--- a/amnesia_bench.py
+++ b/amnesia_bench.py
@@ -177,9 +177,10 @@ def reset(self):
 
 # ─── Exponential Backoff ─────────────────────────────────────────────────────
 
-def with_exponential_backoff(fn, max_retries=5, base_delay=1.0, max_delay=60.0):
+def with_exponential_backoff(fn, max_retries=20, base_delay=2.0, max_delay=120.0):
     """
     Wrap any API call with exponential backoff on 429/503 errors.
+    Respects Retry-After header when present.
     Uses full jitter: delay = min(base * 2^attempt + uniform(0,1), max_delay).
     Raises immediately on non-retriable errors or when retries are exhausted.
     """
@@ -188,7 +189,15 @@ def with_exponential_backoff(fn, max_retries=5, base_delay=1.0, max_delay=60.0):
             return fn()
         except requests.HTTPError as e:
             if e.response.status_code in (429, 503) and attempt < max_retries - 1:
-                delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
+                # Respect Retry-After header if present
+                retry_after = e.response.headers.get("Retry-After") or e.response.headers.get("x-ratelimit-reset-requests")
+                if retry_after:
+                    try:
+                        delay = float(retry_after)
+                    except ValueError:
+                        delay = min(base_delay * (2 ** attempt) + random.uniform(0, 2), max_delay)
+                else:
+                    delay = min(base_delay * (2 ** attempt) + random.uniform(0, 2), max_delay)
                 print(f"    [backoff] {e.response.status_code} — retrying in {delay:.1f}s (attempt {attempt+1}/{max_retries})")
                 time.sleep(delay)
             else:
@@ -222,9 +231,13 @@ def generate(self, messages: list[dict], max_tokens: int) -> dict:
         def _do_request():
             if self.model_name:
                 payload["model"] = self.model_name
+            headers = dict(self.auth_header)
+            # Enable prompt caching for OpenRouter (reduces cost + latency on repeated prompts)
+            if "openrouter.ai" in self.server_url:
+                headers["X-OpenRouter-Cache"] = "true"
             resp = requests.post(
                 f"{self.server_url}/v1/chat/completions",
-                headers=self.auth_header,
+                headers=headers,
                 json=payload,
                 timeout=3600,
                 stream=True,