4 changes: 4 additions & 0 deletions .gitignore
@@ -66,6 +66,10 @@ RELEASE_NOTES.md
jobs/
trajectories/
run-bench-test.sh
benchmark/terminalbench/.local-tarballs/
open-agent-sdk*.tgz
packages/cli/open-agent-sdk-cli*.tgz
packages/core/open-agent-sdk*.tgz

# Next.js build output
.next/
71 changes: 71 additions & 0 deletions benchmark/autoresearch/README.md
@@ -0,0 +1,71 @@
# Autoresearch for Terminal-Bench

This directory adapts the `program.md`-driven workflow popularized by
Karpathy's `autoresearch` project to Open Agent SDK and Terminal-Bench.

## Design

- The optimizing agent edits a narrow search surface:
  - `packages/cli/src/index.ts`
  - selected tool descriptions/formatters in `packages/core/src/tools/`
  - `packages/core/src/agent/react-loop.ts`
- The evaluator is fixed:
  - `benchmark/autoresearch/evaluate.sh`
  - Harbor + `terminal-bench@2.0`
  - task list defaults to `benchmark/terminalbench/task-lists/smoke-5.txt`
- Results are append-only in `benchmark/autoresearch/results.tsv`

This is similar in spirit to
https://github.com/karpathy/autoresearch:
- immutable benchmark
- narrow editable surface
- repeated keep/revert loop
- a Markdown `program.md` that acts as lightweight organizational code

## Recommended Loop

1. Read `program.md` and `scope.md`
2. Make one small hypothesis-driven change
3. Commit it
4. Run:

```bash
bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>"
```

5. If you want automatic rollback on regressions:

```bash
bash ./benchmark/autoresearch/run-experiment.sh \
  --tag "<short-label>" \
  --revert-on-regress
```

The script will:
- run `bun test`
- run `evaluate.sh`
- append a row to `results.tsv`
- compare the latest row to the previous row
- emit `KEEP` or `REVERT`
- optionally `git reset --hard HEAD~1` while preserving the results log

By default it also:
- packs the current local SDK/CLI into tarballs
- serves them over a temporary local HTTP server
- exports `OAS_LOCAL_TARBALL_URL`

This makes Harbor evaluate the latest local code even when task images are already
pre-warmed. Disable this only if you intentionally want to benchmark the code
already baked into the images:

```bash
bash ./benchmark/autoresearch/run-experiment.sh \
  --tag "<short-label>" \
  --no-local-tarballs
```

## Cost Control

For Terminal-Bench, benchmark cost matters more than it does in single-metric toy setups.
Use the existing pre-warmed image path under `benchmark/terminalbench/` so
experiments do not repeatedly reinstall Bun and the OAS CLI during agent setup.
111 changes: 78 additions & 33 deletions benchmark/autoresearch/evaluate.sh
@@ -94,12 +94,71 @@ echo "=== autoresearch evaluate ==="
echo "tasks=$TASK_COUNT k=$K model=$MODEL tag=$TAG"
echo ""

# Harbor prints "Results written to <dir>/result.json" on stdout.
# The run-level result.json contains stats.evals.*.metrics[0].mean;
# a trial-level result.json contains verifier_result.reward.
# ── Helper: extract reward from a task-level Harbor result.json ──
extract_reward_from_result_file() {
  local result_file="$1"

  if [ ! -f "$result_file" ]; then
    echo "-1"
    return
  fi

  python3 - "$result_file" <<'PY' 2>/dev/null
import json
import sys

path = sys.argv[1]

try:
    with open(path) as f:
        d = json.load(f)

    vr = d.get("verifier_result") or {}
    rewards = vr.get("rewards") or {}
    reward = vr.get("reward", rewards.get("reward"))

    if reward is not None:
        print(int(float(reward) >= 0.5))
    elif d.get("exception_info"):
        print(-1)
    else:
        print(0)
except Exception:
    print(-1)
PY
}

# ── Helper: find the newest task-level result.json produced after a marker ──
find_latest_task_result() {
  local task_name="$1"
  local marker_file="$2"

  python3 - "$REPO_ROOT" "$task_name" "$marker_file" <<'PY' 2>/dev/null
import glob
import os
import sys

repo_root, task_name, marker_file = sys.argv[1:]
marker_mtime = os.path.getmtime(marker_file)

pattern = os.path.join(repo_root, "jobs", "*", f"{task_name}__*", "result.json")
candidates = []

for path in glob.glob(pattern):
    try:
        mtime = os.path.getmtime(path)
    except OSError:
        continue
    if mtime >= marker_mtime:
        candidates.append((mtime, path))

if candidates:
    candidates.sort()
    print(candidates[-1][1])
PY
}

# ── Helper: fallback to parsing Harbor stdout when artifacts are unavailable ──
extract_reward_from_output() {
local run_output="$1"

@@ -117,32 +176,7 @@ extract_reward_from_output() {
  trial_result=$(find "$result_dir" -mindepth 2 -name "result.json" 2>/dev/null | head -1)

  if [ -n "$trial_result" ] && [ -f "$trial_result" ]; then
    extract_reward_from_result_file "$trial_result"
    return
  fi

@@ -179,6 +213,8 @@ except Exception:
# ── Helper: run one trial, return 1=pass 0=fail -1=error ──
run_single_trial() {
  local task_name="$1"
  local marker_file
  marker_file="$(mktemp)"

  # Build harbor command as array
  local -a cmd=(
@@ -236,9 +272,18 @@ run_single_trial() {
  rc=$?
  set -e

  local latest_result reward
  latest_result="$(find_latest_task_result "$task_name" "$marker_file")"
  rm -f "$marker_file"

  if [ -n "$latest_result" ]; then
    reward="$(extract_reward_from_result_file "$latest_result")"
    echo "${reward:--1}"
    return
  fi

  if [ "$rc" -ne 0 ]; then
    # Harbor exited non-zero, but it may still have written result.json
    reward=$(extract_reward_from_output "$run_output")
    if [ "$reward" != "-1" ]; then
      echo "$reward"
17 changes: 14 additions & 3 deletions benchmark/autoresearch/program.md
@@ -25,7 +25,7 @@ When k=1, all metrics collapse to the same number (simple pass rate).
4. Create a git branch: `git checkout -b autoresearch/<your-tag>` (use a short, descriptive tag).
5. Run a **baseline evaluation**:
```bash
./benchmark/autoresearch/evaluate.sh -k 3 --tag baseline --output benchmark/autoresearch/results.tsv
bash ./benchmark/autoresearch/run-experiment.sh --tag baseline
```
6. Review the baseline numbers before proceeding.

@@ -73,11 +73,17 @@ git commit -m "experiment: <description of what you changed and why>"

### Step 5: Evaluate

Preferred:
```bash
bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>"
```

Manual fallback:
```bash
./benchmark/autoresearch/evaluate.sh -k 3 --tag "<short-label>" --output benchmark/autoresearch/results.tsv
```

Wait for it to complete. The script outputs pass@k, pass^k, and avg_trial_rate.
Wait for the run to complete. Both scripts report pass@k, pass^k, and avg_trial_rate.
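
As a reference for reading those numbers, here is a hedged sketch of the three metrics computed from per-task trial outcomes (1 = pass, 0 = fail); the exact formulas live in `evaluate.sh` and may differ in edge cases:

```python
def summarize(trials_by_task: dict) -> dict:
    """Aggregate per-task trial outcomes into the three reported metrics."""
    n = len(trials_by_task)
    # pass@k: fraction of tasks where at least one trial passed
    pass_at_k = sum(any(t) for t in trials_by_task.values()) / n
    # pass^k: fraction of tasks where every trial passed
    pass_hat_k = sum(all(t) for t in trials_by_task.values()) / n
    # avg_trial_rate: plain pass rate over all trials of all tasks
    all_trials = [r for t in trials_by_task.values() for r in t]
    avg_trial_rate = sum(all_trials) / len(all_trials)
    return {"pass@k": pass_at_k, "pass^k": pass_hat_k, "avg_trial_rate": avg_trial_rate}
```

With k=1 every task has a single trial, so all three collapse to the simple pass rate, matching the note at the top of this file.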

### Step 6: Analyze & Decide

@@ -92,11 +98,16 @@ Read the results and compare against the previous baseline in `results.tsv`.
- pass@k decreased (lost capability)
- pass^k decreased AND pass@k didn't improve (net negative)

To revert:
To revert manually:
```bash
git reset --hard HEAD~1
```

If you use the helper script below, it can decide and revert automatically:
```bash
bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>" --revert-on-regress
```

Record failed experiments in `results.tsv` anyway — append `[REVERTED]` to the description.

**Prioritization:**