From 45728c5bbab6eae61a08ecbc0ed385e10a631d28 Mon Sep 17 00:00:00 2001
From: octane0411 <wdznb1@gmail.com>
Date: Tue, 24 Mar 2026 00:23:27 +0800
Subject: [PATCH 1/4] feat(benchmark): add terminalbench prewarm workflow

---
 .gitignore                                    |   4 +
 benchmark/terminalbench/README.md             |  19 ++
 .../open_agent_sdk_harbor/agent.py            |   3 +
 .../install-open-agent-sdk.sh.j2              |  41 ++-
 benchmark/terminalbench/prewarm-images.sh     | 287 ++++++++++++++++++
 .../scripts/pack-local-tarballs.sh            |  70 +++++
 6 files changed, 421 insertions(+), 3 deletions(-)
 create mode 100755 benchmark/terminalbench/prewarm-images.sh
 create mode 100755 benchmark/terminalbench/scripts/pack-local-tarballs.sh

diff --git a/.gitignore b/.gitignore
index 82761a4..d21422e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,10 @@ RELEASE_NOTES.md
 jobs/
 trajectories/
 run-bench-test.sh
+benchmark/terminalbench/.local-tarballs/
+open-agent-sdk*.tgz
+packages/cli/open-agent-sdk-cli*.tgz
+packages/core/open-agent-sdk*.tgz
 
 # Next.js build output
 .next/
diff --git a/benchmark/terminalbench/README.md b/benchmark/terminalbench/README.md
index 4a6f1b0..1c8c29a 100644
--- a/benchmark/terminalbench/README.md
+++ b/benchmark/terminalbench/README.md
@@ -78,6 +78,25 @@ Scripts:
 
 - `benchmark/terminalbench/scripts/run-terminalbench-overnight.sh`
 - `benchmark/terminalbench/scripts/cleanup-terminalbench-images.sh`
+- `benchmark/terminalbench/scripts/pack-local-tarballs.sh`
+
+To pre-install the local SDK/CLI build into cached task images and avoid repeated
+registry installs on every trial:
+
+```bash
+./benchmark/terminalbench/prewarm-images.sh \
+  --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt \
+  --pack-local-tarballs
+```
+
+Notes:
+
+- This builds fresh local tarballs, serves them temporarily on `host.docker.internal`,
+  and bakes `bun`, `oas`, `uv`, and `pytest` into each task image.
+- Override `--tarball-host` or `--tarball-port` if your Docker runtime cannot reach
+  `host.docker.internal:8765`.
+- After pre-warm, normal Harbor runs reuse the pre-installed `oas` fast path and do
+  not need `OAS_LOCAL_TARBALL_URL`.
 
 The overnight runner always sources **main workspace** `.env` (via git common dir), so it works correctly even when executed from a git worktree.
 
diff --git a/benchmark/terminalbench/open_agent_sdk_harbor/agent.py b/benchmark/terminalbench/open_agent_sdk_harbor/agent.py
index 7ac2d49..5e33bc4 100644
--- a/benchmark/terminalbench/open_agent_sdk_harbor/agent.py
+++ b/benchmark/terminalbench/open_agent_sdk_harbor/agent.py
@@ -120,6 +120,9 @@ def _install_agent_template_path(self) -> Path:
     def _setup_env(self) -> dict[str, str]:
         """Pass mirror/local-install env vars to install script."""
         env = super()._setup_env()
+        version = self.version()
+        if version:
+            env["OAS_PACKAGE_VERSION"] = version
         for key in ("OAS_GITHUB_MIRROR", "OAS_NPM_REGISTRIES", "OAS_LOCAL_TARBALL_URL"):
             val = os.environ.get(key)
             if val:
diff --git a/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2 b/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2
index f9d3fe0..1950f4f 100644
--- a/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2
+++ b/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2
@@ -1,6 +1,15 @@
 #!/bin/bash
 set -euo pipefail
 
+# Fast path: skip install if oas CLI is already available (pre-warmed image)
+# unless this run explicitly requests a fresh local tarball install.
+if [ -z "${OAS_LOCAL_TARBALL_URL:-}" ] && command -v oas &>/dev/null && command -v bun &>/dev/null; then
+    echo "oas CLI already installed (pre-warmed image), skipping setup"
+    echo "Bun ready: $(bun --version)"
+    echo "CLI ready: $(which oas)"
+    exit 0
+fi
+
 retry() {
     local attempts="$1"
     local sleep_sec="$2"
@@ -18,6 +27,26 @@ retry() {
     done
 }
 
+download_tarball_with_fallback() {
+    # Usage: download_tarball_with_fallback DEST NAME...; the first NAME that downloads wins, else return 1.
+    local dest="$1" tarball_url_base="${OAS_LOCAL_TARBALL_URL%/}" name url
+    shift
+
+    for name in "$@"; do
+        [ -z "$name" ] && continue
+        url="${tarball_url_base}/${name}"
+        echo "Fetching local tarball: ${url}"
+        if curl -fsSL "$url" -o "$dest"; then
+            echo "Downloaded ${name}"
+            return 0
+        fi
+        echo "Missing ${name}, trying next candidate..."
+    done
+
+    echo "ERROR: failed to download tarball from ${tarball_url_base}" >&2
+    return 1
+}
+
 install_cli_with_registry_fallback() {
     # Comma-separated registries, can be overridden by environment.
     # Example:
@@ -65,8 +94,13 @@ export PATH="$HOME/.bun/bin:$PATH"
 if [ -n "${OAS_LOCAL_TARBALL_URL:-}" ]; then
     echo "Installing from local tarballs at ${OAS_LOCAL_TARBALL_URL}"
     mkdir -p /tmp/oas-local && cd /tmp/oas-local
-    curl -fsSL "${OAS_LOCAL_TARBALL_URL}/open-agent-sdk-0.1.0-alpha.1.tgz" -o sdk.tgz
-    curl -fsSL "${OAS_LOCAL_TARBALL_URL}/open-agent-sdk-cli-0.1.0-alpha.1.tgz" -o cli.tgz
+    package_version="${OAS_PACKAGE_VERSION:-0.1.0-alpha.1}"
+    download_tarball_with_fallback sdk.tgz \
+        "open-agent-sdk.tgz" \
+        "open-agent-sdk-${package_version}.tgz"
+    download_tarball_with_fallback cli.tgz \
+        "open-agent-sdk-cli.tgz" \
+        "open-agent-sdk-cli-${package_version}.tgz"
     mkdir -p sdk cli
     tar xzf sdk.tgz -C sdk --strip-components=1
     tar xzf cli.tgz -C cli --strip-components=1
@@ -94,4 +128,5 @@ fi
 
 echo "Bun ready: $(bun --version)"
 echo "CLI ready: $(which oas)"
-echo "CLI version: $(oas --version 2>&1 || echo 'version check failed')"
+help_output="$(oas --help 2>&1 || true)"
+echo "CLI help: $(printf '%s\n' "$help_output" | sed -n '1p')"
diff --git a/benchmark/terminalbench/prewarm-images.sh b/benchmark/terminalbench/prewarm-images.sh
new file mode 100755
index 0000000..d5cdecb
--- /dev/null
+++ b/benchmark/terminalbench/prewarm-images.sh
@@ -0,0 +1,287 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+#
+# prewarm-images.sh — Pre-install bun + oas CLI + pytest into task Docker images
+#
+# Eliminates repeated ~100s setup overhead per task by baking dependencies
+# into the task Docker images. The original images are backed up as
+# oas-original/<name>:<tag> and the pre-warmed images replace the originals,
+# so Harbor uses them transparently without any config changes.
+#
+# Usage:
+#   ./benchmark/terminalbench/prewarm-images.sh
+#   ./benchmark/terminalbench/prewarm-images.sh --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt
+#   ./benchmark/terminalbench/prewarm-images.sh --all
+#   ./benchmark/terminalbench/prewarm-images.sh --force
+#   ./benchmark/terminalbench/prewarm-images.sh --restore   # restore original images
+#
+
+REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+TASKS_FILE="${REPO_ROOT}/benchmark/terminalbench/task-lists/smoke-5.txt"
+PREWARM_ALL=false
+FORCE=false
+RESTORE=false
+BACKUP_PREFIX="oas-original"
+PYPI_MIRROR="https://pypi.tuna.tsinghua.edu.cn/simple"
+PACK_LOCAL_TARBALLS=false
+TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs"
+TARBALL_PORT="8765"
+TARBALL_HOST="host.docker.internal"
+PACKAGE_VERSION=""
+SERVER_LOG=""
+TARBALL_SERVER_PID=""
+DOCKER_ADD_HOST_ARGS=()
+
+usage() {
+  cat <<'EOF'
+Usage: prewarm-images.sh [options]
+
+Options:
+  --tasks-file FILE    Task list file (default: smoke-5.txt)
+  --all                Pre-warm ALL cached task images
+  --force              Force rebuild even if already pre-warmed
+  --restore            Restore original images from backup
+  --pypi-mirror URL    PyPI mirror for pytest install (default: tsinghua)
+  --pack-local-tarballs
+                     Build repo-local SDK/CLI tarballs and serve them temporarily
+  --tarball-dir DIR    Directory used for generated local tarballs
+  --tarball-port N     HTTP port for temporary tarball server (default: 8765)
+  --tarball-host HOST  Hostname containers should use for tarball server
+                       (default: host.docker.internal)
+  -h, --help           Show help
+EOF
+}
+
+while (($#)); do
+  case "$1" in
+    --tasks-file) TASKS_FILE="${2:-}"; shift 2 ;;
+    --all) PREWARM_ALL=true; shift ;;
+    --force) FORCE=true; shift ;;
+    --restore) RESTORE=true; shift ;;
+    --pypi-mirror) PYPI_MIRROR="${2:-}"; shift 2 ;;
+    --pack-local-tarballs) PACK_LOCAL_TARBALLS=true; shift ;;
+    --tarball-dir) TARBALL_DIR="${2:-}"; shift 2 ;;
+    --tarball-port) TARBALL_PORT="${2:-}"; shift 2 ;;
+    --tarball-host) TARBALL_HOST="${2:-}"; shift 2 ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage; exit 1 ;;
+  esac
+done
+
+cleanup() {
+  rm -f "$TASK_MAP"
+  if [ -n "${TARBALL_SERVER_PID:-}" ]; then
+    kill "$TARBALL_SERVER_PID" >/dev/null 2>&1 || true
+    wait "$TARBALL_SERVER_PID" 2>/dev/null || true
+  fi
+  if [ -n "${SERVER_LOG:-}" ]; then
+    rm -f "$SERVER_LOG"
+  fi
+}
+
+# Load env; git -C prints a common dir relative to $REPO_ROOT (e.g. ".git"), so resolve it from there, not from $PWD
+MAIN_GIT_DIR="$(git -C "$REPO_ROOT" rev-parse --git-common-dir)"
+MAIN_ENV_FILE="$(cd "$REPO_ROOT" && cd "$MAIN_GIT_DIR/.." && pwd)/.env"
+if [ -f "$MAIN_ENV_FILE" ]; then
+  set -a; source "$MAIN_ENV_FILE"; set +a
+fi
+
+# Collect unique docker images from tasks; `|| true` keeps a no-match lookup from aborting under pipefail + set -e
+TASK_MAP=$(mktemp)
+trap cleanup EXIT
+
+if [ "$PREWARM_ALL" = true ]; then
+  while IFS= read -r toml; do
+    image=$(grep 'docker_image' "$toml" 2>/dev/null | head -1 | sed 's/.*= *"//;s/"//' || true)
+    [ -z "$image" ] && continue
+    task_name=$(basename "$(dirname "$toml")")
+    printf '%s\t%s\n' "$task_name" "$image" >> "$TASK_MAP"
+  done < <(find ~/.cache/harbor/tasks/ -name "task.toml" 2>/dev/null)
+else
+  if [ ! -f "$TASKS_FILE" ]; then
+    echo "Tasks file not found: $TASKS_FILE" >&2
+    exit 1
+  fi
+  while IFS= read -r task_name; do
+    [[ -z "$task_name" || "$task_name" == \#* ]] && continue
+    toml=$(find ~/.cache/harbor/tasks/ -path "*/$task_name/task.toml" 2>/dev/null | head -1 || true)
+    if [ -z "$toml" ]; then
+      echo "WARN: task '$task_name' not found in cache, skipping"
+      continue
+    fi
+    image=$(grep -E '^docker_image|^# original_docker_image' "$toml" | head -1 | sed 's/.*= *"//;s/"//' || true)
+    [ -n "$image" ] || { echo "WARN: no docker_image found in $toml, skipping"; continue; }
+    printf '%s\t%s\n' "$task_name" "$image" >> "$TASK_MAP"
+  done < "$TASKS_FILE"
+fi
+
+UNIQUE_IMAGES=$(cut -f2 "$TASK_MAP" | sort -u)
+UNIQUE_COUNT=$(echo "$UNIQUE_IMAGES" | grep -c . || true)
+TASK_COUNT=$(wc -l < "$TASK_MAP" | tr -d ' ')
+
+# --restore: swap back original images
+if [ "$RESTORE" = true ]; then
+  echo "=== Restoring original images ==="
+  RESTORED=0
+  while IFS= read -r image; do
+    [ -z "$image" ] && continue
+    image_base=$(echo "$image" | sed 's|.*/||')
+    backup="${BACKUP_PREFIX}/${image_base}"
+    if docker image inspect "$backup" &>/dev/null; then
+      docker tag "$backup" "$image"
+      echo "  [OK] $backup -> $image"
+      RESTORED=$((RESTORED + 1))
+    fi
+  done <<< "$UNIQUE_IMAGES"
+  echo "Restored: $RESTORED"
+  exit 0
+fi
+
+echo "=== Pre-warm Docker images ==="
+echo "Tasks: $TASK_COUNT  Unique images: $UNIQUE_COUNT"
+echo ""
+
+if [ "$PACK_LOCAL_TARBALLS" = true ]; then
+  PACK_SCRIPT="${REPO_ROOT}/benchmark/terminalbench/scripts/pack-local-tarballs.sh"
+  if [ ! -f "$PACK_SCRIPT" ]; then
+    echo "ERROR: pack script not found: $PACK_SCRIPT" >&2
+    exit 1
+  fi
+  if ! command -v python3 >/dev/null 2>&1; then
+    echo "ERROR: python3 is required for --pack-local-tarballs" >&2
+    exit 1
+  fi
+
+  echo "=== Preparing local tarballs ==="
+  bash "$PACK_SCRIPT" --output-dir "$TARBALL_DIR"
+
+  SERVER_LOG=$(mktemp)
+  python3 -m http.server "$TARBALL_PORT" --bind 0.0.0.0 --directory "$TARBALL_DIR" \
+    >"$SERVER_LOG" 2>&1 &
+  TARBALL_SERVER_PID=$!
+  sleep 1
+  if ! kill -0 "$TARBALL_SERVER_PID" >/dev/null 2>&1; then
+    echo "ERROR: failed to start local tarball server on port $TARBALL_PORT" >&2
+    cat "$SERVER_LOG" >&2 || true
+    exit 1
+  fi
+
+  OAS_LOCAL_TARBALL_URL="http://${TARBALL_HOST}:${TARBALL_PORT}"
+  export OAS_LOCAL_TARBALL_URL
+  DOCKER_ADD_HOST_ARGS=(--add-host "${TARBALL_HOST}:host-gateway")
+  echo "Serving local tarballs from: $OAS_LOCAL_TARBALL_URL"
+  echo ""
+fi
+
+if command -v python3 >/dev/null 2>&1; then
+  PACKAGE_VERSION="$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["version"])' "${REPO_ROOT}/package.json")"
+fi
+
+INSTALL_SCRIPT="${REPO_ROOT}/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2"
+if [ ! -f "$INSTALL_SCRIPT" ]; then
+  echo "ERROR: install script not found: $INSTALL_SCRIPT" >&2
+  exit 1
+fi
+
+WARMED=0
+SKIPPED=0
+FAILED=0
+
+while IFS= read -r image; do
+  [ -z "$image" ] && continue
+
+  image_base=$(echo "$image" | sed 's|.*/||')
+  backup="${BACKUP_PREFIX}/${image_base}"
+  tasks=$(awk -F'\t' -v img="$image" '$2==img {printf "%s ", $1}' "$TASK_MAP")
+
+  # Check if already pre-warmed (backup exists = already done)
+  if [ "$FORCE" != true ] && docker image inspect "$backup" &>/dev/null; then
+    echo "[SKIP] $image (already pre-warmed, use --force to rebuild)"
+    SKIPPED=$((SKIPPED + 1))
+    continue
+  fi
+
+  echo "[BUILD] $image"
+  echo "  tasks: $tasks"
+
+  # Pull original if needed
+  if ! docker image inspect "$image" &>/dev/null; then
+    echo "  pulling $image ..."
+    if ! docker pull "$image"; then
+      echo "  FAIL: pull failed"
+      FAILED=$((FAILED + 1))
+      continue
+    fi
+  fi
+
+  if docker image inspect "$backup" &>/dev/null; then
+    echo "  restoring original image from backup before rebuild"
+    docker tag "$backup" "$image"
+  else
+    # Backup original image on first pre-warm
+    docker tag "$image" "$backup"
+  fi
+
+  container_name="oas-prewarm-$$"
+
+  # Build combined setup script
+  SETUP_SCRIPT=$(mktemp)
+  cat > "$SETUP_SCRIPT" << 'SETUP_HEADER'
+#!/bin/bash
+set -euo pipefail
+SETUP_HEADER
+
+  # Append agent install script (strip jinja)
+  sed 's/{%.*%}//g; s/{{.*}}//g' "$INSTALL_SCRIPT" >> "$SETUP_SCRIPT"
+
+  # Append pytest pre-install
+  cat >> "$SETUP_SCRIPT" << SETUP_FOOTER
+
+# Pre-install uv + pytest for verifier
+echo "=== Pre-installing uv + pytest ==="
+if ! command -v curl &>/dev/null; then
+  apt-get update -qq && apt-get install -y -qq curl >/dev/null 2>&1 || true
+fi
+curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh
+export PATH="\$HOME/.local/bin:\$PATH"
+UV_INDEX_URL="${PYPI_MIRROR}" UV_HTTP_TIMEOUT=300 uvx \\
+  -p 3.13 \\
+  -w pytest==8.4.1 \\
+  -w pytest-json-ctrf==0.3.5 \\
+  pytest --version
+echo "=== Pre-warm complete ==="
+SETUP_FOOTER
+
+  # Run in container; the ${arr[@]+...} form keeps an empty array from tripping `set -u` on bash < 4.4
+  if docker run --name "$container_name" \
+    ${DOCKER_ADD_HOST_ARGS[@]+"${DOCKER_ADD_HOST_ARGS[@]}"} \
+    -e "OAS_GITHUB_MIRROR=${OAS_GITHUB_MIRROR:-}" \
+    -e "OAS_NPM_REGISTRIES=${OAS_NPM_REGISTRIES:-}" \
+    -e "OAS_LOCAL_TARBALL_URL=${OAS_LOCAL_TARBALL_URL:-}" \
+    -e "OAS_PACKAGE_VERSION=${PACKAGE_VERSION}" \
+    "$image" \
+    bash -c "$(cat "$SETUP_SCRIPT")" 2>&1 | tail -5; then
+    # Replace original image with pre-warmed version
+    docker commit "$container_name" "$image" > /dev/null
+    docker rm -f "$container_name" > /dev/null
+    echo "  OK: pre-warmed (backup: $backup)"
+    WARMED=$((WARMED + 1))
+  else
+    echo "  FAIL: setup exited with error"
+    docker rm -f "$container_name" &>/dev/null || true
+    # Restore the original tag, then drop the backup: its existence is what marks an image "already pre-warmed"
+    docker tag "$backup" "$image"
+    docker rmi "$backup" &>/dev/null || true
+    FAILED=$((FAILED + 1))
+  fi
+  rm -f "$SETUP_SCRIPT"
+
+done <<< "$UNIQUE_IMAGES"
+
+echo ""
+echo "=== Summary ==="
+echo "  Warmed: $WARMED  Skipped: $SKIPPED  Failed: $FAILED"
+echo ""
+echo "Done. Pre-warmed images replace originals — no config changes needed."
+echo "Use --restore to revert to original images."
diff --git a/benchmark/terminalbench/scripts/pack-local-tarballs.sh b/benchmark/terminalbench/scripts/pack-local-tarballs.sh
new file mode 100755
index 0000000..eb1dfeb
--- /dev/null
+++ b/benchmark/terminalbench/scripts/pack-local-tarballs.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+OUTPUT_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs"
+SKIP_BUILD=false
+NPM_CACHE_DIR=""
+
+usage() {
+  cat <<'EOF'
+Usage: pack-local-tarballs.sh [options]
+
+Options:
+  --output-dir DIR    Directory to write tarballs into
+  --skip-build        Reuse existing packages/core/dist without rebuilding
+  -h, --help          Show help
+EOF
+}
+
+while (($#)); do
+  case "$1" in
+    --output-dir) OUTPUT_DIR="${2:-}"; shift 2 ;;
+    --skip-build) SKIP_BUILD=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage; exit 1 ;;
+  esac
+done
+
+if ! command -v bun >/dev/null 2>&1; then
+  echo "bun not found in PATH" >&2
+  exit 1
+fi
+
+if ! command -v npm >/dev/null 2>&1; then
+  echo "npm not found in PATH" >&2
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+rm -f "$OUTPUT_DIR"/open-agent-sdk*.tgz "$OUTPUT_DIR"/open-agent-sdk-cli*.tgz
+NPM_CACHE_DIR="${OUTPUT_DIR}/.npm-cache"
+mkdir -p "$NPM_CACHE_DIR"
+
+if [ "$SKIP_BUILD" != true ]; then
+  echo "=== Building core package ==="
+  (
+    cd "$REPO_ROOT/packages/core"
+    bun run build
+  )
+fi
+
+echo "=== Packing local tarballs ==="
+CORE_TARBALL="$(
+  cd "$REPO_ROOT/packages/core"
+  NPM_CONFIG_CACHE="$NPM_CACHE_DIR" npm pack --silent --pack-destination "$OUTPUT_DIR"
+)"
+CLI_TARBALL="$(
+  cd "$REPO_ROOT/packages/cli"
+  NPM_CONFIG_CACHE="$NPM_CACHE_DIR" npm pack --silent --pack-destination "$OUTPUT_DIR"
+)"
+
+cp "$OUTPUT_DIR/$CORE_TARBALL" "$OUTPUT_DIR/open-agent-sdk.tgz"
+cp "$OUTPUT_DIR/$CLI_TARBALL" "$OUTPUT_DIR/open-agent-sdk-cli.tgz"
+
+echo "Output dir: $OUTPUT_DIR"
+echo "Core tarball: $CORE_TARBALL"
+echo "CLI tarball: $CLI_TARBALL"
+echo "Stable aliases:"
+echo "  - open-agent-sdk.tgz"
+echo "  - open-agent-sdk-cli.tgz"

From b923d8e3cdbbeba4cdbd06ca4a587162921673f5 Mon Sep 17 00:00:00 2001
From: octane0411 <wdznb1@gmail.com>
Date: Tue, 24 Mar 2026 00:23:46 +0800
Subject: [PATCH 2/4] feat(autoresearch): add smoke-5 experiment runner

---
 benchmark/autoresearch/README.md         |  71 +++++
 benchmark/autoresearch/evaluate.sh       | 111 +++++---
 benchmark/autoresearch/program.md        |  17 +-
 benchmark/autoresearch/run-experiment.sh | 316 +++++++++++++++++++++++
 4 files changed, 479 insertions(+), 36 deletions(-)
 create mode 100644 benchmark/autoresearch/README.md
 create mode 100755 benchmark/autoresearch/run-experiment.sh

diff --git a/benchmark/autoresearch/README.md b/benchmark/autoresearch/README.md
new file mode 100644
index 0000000..da101c7
--- /dev/null
+++ b/benchmark/autoresearch/README.md
@@ -0,0 +1,71 @@
+# Autoresearch for Terminal-Bench
+
+This directory adapts the `program.md`-driven workflow popularized by
+Karpathy's `autoresearch` project to Open Agent SDK and Terminal-bench.
+
+## Design
+
+- The optimizing agent edits a narrow search surface:
+  - `packages/cli/src/index.ts`
+  - selected tool descriptions/formatters in `packages/core/src/tools/`
+  - `packages/core/src/agent/react-loop.ts`
+- The evaluator is fixed:
+  - `benchmark/autoresearch/evaluate.sh`
+  - Harbor + `terminal-bench@2.0`
+  - task list defaults to `benchmark/terminalbench/task-lists/smoke-5.txt`
+- Results are append-only in `benchmark/autoresearch/results.tsv`
+
+This is similar in spirit to
+https://github.com/karpathy/autoresearch:
+- immutable benchmark
+- narrow editable surface
+- repeated keep/revert loop
+- a Markdown `program.md` that acts like lightweight org code
+
+## Recommended Loop
+
+1. Read `program.md` and `scope.md`
+2. Make one small hypothesis-driven change
+3. Commit it
+4. Run:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>"
+```
+
+5. If you want automatic rollback on regressions:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh \
+  --tag "<short-label>" \
+  --revert-on-regress
+```
+
+The script will:
+- run `bun test`
+- run `evaluate.sh`
+- append a row to `results.tsv`
+- compare the latest row to the previous row
+- emit `KEEP` or `REVERT`
+- optionally `git reset --hard HEAD~1` while preserving the results log
+
+By default it also:
+- packs the current local SDK/CLI into tarballs
+- serves them over a temporary local HTTP server
+- exports `OAS_LOCAL_TARBALL_URL`
+
+This makes Harbor evaluate the latest local code even when task images are already
+pre-warmed. Disable this only if you intentionally want to benchmark the code
+already baked into the images:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh \
+  --tag "<short-label>" \
+  --no-local-tarballs
+```
+
+## Cost Control
+
+For Terminal-bench, benchmark cost matters more than in single-metric toy setups.
+Use the existing pre-warmed image path under `benchmark/terminalbench/` so
+experiments do not repeatedly reinstall Bun and the OAS CLI during agent setup.
diff --git a/benchmark/autoresearch/evaluate.sh b/benchmark/autoresearch/evaluate.sh
index 4bb30ea..acb26b2 100755
--- a/benchmark/autoresearch/evaluate.sh
+++ b/benchmark/autoresearch/evaluate.sh
@@ -94,12 +94,71 @@ echo "=== autoresearch evaluate ==="
 echo "tasks=$TASK_COUNT  k=$K  model=$MODEL  tag=$TAG"
 echo ""
 
-# ── Helper: extract reward from harbor result.json ──
-# Harbor prints "Results written to <dir>/result.json" in stdout.
-# The run-level result.json contains stats.evals.*.metrics[0].mean
-# The trial-level result.json contains verifier_result.reward
-#
-# We parse the run-level result.json for the mean reward.
+# ── Helper: extract reward from a task-level Harbor result.json ──
+extract_reward_from_result_file() {
+  local result_file="$1"
+
+  if [ ! -f "$result_file" ]; then
+    echo "-1"
+    return
+  fi
+
+  python3 - "$result_file" <<'PY' 2>/dev/null
+import json
+import sys
+
+path = sys.argv[1]
+
+try:
+    with open(path) as f:
+        d = json.load(f)
+
+    vr = d.get("verifier_result") or {}
+    rewards = vr.get("rewards") or {}
+    reward = vr.get("reward", rewards.get("reward"))
+
+    if reward is not None:
+        print(int(float(reward) >= 0.5))
+    elif d.get("exception_info"):
+        print(-1)
+    else:
+        print(0)
+except Exception:
+    print(-1)
+PY
+}
+
+# ── Helper: find the newest task-level result.json produced after a marker ──
+find_latest_task_result() {
+  local task_name="$1"
+  local marker_file="$2"
+
+  python3 - "$REPO_ROOT" "$task_name" "$marker_file" <<'PY' 2>/dev/null
+import glob
+import os
+import sys
+
+repo_root, task_name, marker_file = sys.argv[1:]
+marker_mtime = os.path.getmtime(marker_file)
+
+pattern = os.path.join(repo_root, "jobs", "*", f"{task_name}__*", "result.json")
+candidates = []
+
+for path in glob.glob(pattern):
+    try:
+        mtime = os.path.getmtime(path)
+    except OSError:
+        continue
+    if mtime >= marker_mtime:
+        candidates.append((mtime, path))
+
+if candidates:
+    candidates.sort()
+    print(candidates[-1][1])
+PY
+}
+
+# ── Helper: fallback to parsing Harbor stdout when artifacts are unavailable ──
 extract_reward_from_output() {
   local run_output="$1"
 
@@ -117,32 +176,7 @@ extract_reward_from_output() {
   trial_result=$(find "$result_dir" -mindepth 2 -name "result.json" 2>/dev/null | head -1)
 
   if [ -n "$trial_result" ] && [ -f "$trial_result" ]; then
-    # Check verifier_result.reward in trial result
-    local reward
-    reward=$(python3 -c "
-import json, sys
-try:
-    d = json.load(open('$trial_result'))
-    vr = d.get('verifier_result') or {}
-    # Harbor stores reward in different formats:
-    #   verifier_result.reward (flat)
-    #   verifier_result.rewards.reward (nested)
-    r = vr.get('reward')
-    if r is None:
-        rewards = vr.get('rewards') or {}
-        r = rewards.get('reward')
-    if r is not None:
-        print(int(float(r) >= 0.5))
-        sys.exit(0)
-    # No verifier result — check if there was an exception
-    if d.get('exception_info'):
-        print(-1)
-    else:
-        print(0)
-except Exception:
-    print(-1)
-" 2>/dev/null)
-    echo "${reward:--1}"
+    extract_reward_from_result_file "$trial_result"
     return
   fi
 
@@ -179,6 +213,8 @@ except Exception:
 # ── Helper: run one trial, return 1=pass 0=fail -1=error ──
 run_single_trial() {
   local task_name="$1"
+  local marker_file
+  marker_file="$(mktemp)"
 
   # Build harbor command as array
   local -a cmd=(
@@ -236,9 +272,18 @@ run_single_trial() {
   rc=$?
   set -e
 
+  local latest_result reward
+  latest_result="$(find_latest_task_result "$task_name" "$marker_file")"
+  rm -f "$marker_file"
+
+  if [ -n "$latest_result" ]; then
+    reward="$(extract_reward_from_result_file "$latest_result")"
+    echo "${reward:--1}"
+    return
+  fi
+
   if [ "$rc" -ne 0 ]; then
     # Harbor exited non-zero, but still may have written result.json
-    local reward
     reward=$(extract_reward_from_output "$run_output")
     if [ "$reward" != "-1" ]; then
       echo "$reward"
diff --git a/benchmark/autoresearch/program.md b/benchmark/autoresearch/program.md
index 5107a60..c2a9869 100644
--- a/benchmark/autoresearch/program.md
+++ b/benchmark/autoresearch/program.md
@@ -25,7 +25,7 @@ When k=1, all metrics collapse to the same number (simple pass rate).
 4. Create a git branch: `git checkout -b autoresearch/<your-tag>` (use a short, descriptive tag).
 5. Run a **baseline evaluation**:
    ```bash
-   ./benchmark/autoresearch/evaluate.sh -k 3 --tag baseline --output benchmark/autoresearch/results.tsv
+   bash ./benchmark/autoresearch/run-experiment.sh --tag baseline
    ```
 6. Review the baseline numbers before proceeding.
 
@@ -73,11 +73,17 @@ git commit -m "experiment: <description of what you changed and why>"
 
 ### Step 5: Evaluate
 
+Preferred:
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>"
+```
+
+Manual fallback:
 ```bash
 ./benchmark/autoresearch/evaluate.sh -k 3 --tag "<short-label>" --output benchmark/autoresearch/results.tsv
 ```
 
-Wait for it to complete. The script outputs pass@k, pass^k, and avg_trial_rate.
+Wait for it to complete. The scripts output pass@k, pass^k, and avg_trial_rate.
 
 ### Step 6: Analyze & Decide
 
@@ -92,11 +98,16 @@ Read the results and compare against the previous baseline in `results.tsv`.
 - pass@k decreased (lost capability)
 - pass^k decreased AND pass@k didn't improve (net negative)
 
-To revert:
+To revert manually:
 ```bash
 git reset --hard HEAD~1
 ```
 
+If you use the helper script below, it can decide and revert automatically:
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<short-label>" --revert-on-regress
+```
+
 Record failed experiments in `results.tsv` anyway — append `[REVERTED]` to the description.
 
 **Prioritization:**
diff --git a/benchmark/autoresearch/run-experiment.sh b/benchmark/autoresearch/run-experiment.sh
new file mode 100755
index 0000000..bd6f2a8
--- /dev/null
+++ b/benchmark/autoresearch/run-experiment.sh
@@ -0,0 +1,316 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on command failure, unset variables and failed pipeline stages
+
+REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"  # two directories above this script
+OUTPUT="${REPO_ROOT}/benchmark/autoresearch/results.tsv"  # --output
+TASKS_FILE="${REPO_ROOT}/benchmark/terminalbench/task-lists/smoke-5.txt"  # --tasks-file
+MODEL="MiniMax-M2.5"  # --model
+K=3  # -k, trials per task
+TAG=""  # --tag, required
+SLEEP_BETWEEN=3  # --sleep, forwarded to evaluate.sh
+SKIP_TESTS=false
+REVERT_ON_REGRESS=false
+FULL_TESTS=false
+TEST_CMD=(  # default targeted test gate; replaced by plain "bun test" when --full-tests is given
+  "bun" "test"
+  "packages/core/tests/agent/react-loop.test.ts"
+  "packages/core/tests/agent/react-loop-system-prompt.test.ts"
+  "packages/core/tests/agent/compact.test.ts"
+  "packages/core/tests/agent/compact-auto-trigger.test.ts"
+  "packages/core/tests/tools/bash.test.ts"
+  "packages/core/tests/tools/read.test.ts"
+  "packages/core/tests/tools/write.test.ts"
+  "packages/core/tests/tools/edit.test.ts"
+  "packages/core/tests/tools/glob.test.ts"
+  "packages/core/tests/tools/grep.test.ts"
+  "packages/core/tests/integration.test.ts"
+)
+OUTPUT_REL=""  # OUTPUT relative to REPO_ROOT; filled in after argument parsing
+USE_LOCAL_TARBALLS=true
+TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs"
+TARBALL_PORT="8765"  # first port probed; a later free port may be used instead
+TARBALL_HOST="host.docker.internal"
+TARBALL_SERVER_PID=""  # set when the temporary HTTP server is started
+TARBALL_SERVER_LOG=""  # temp file that captures the server output
+
+usage() {  # print the command-line help text on stdout
+  cat <<'EOF'
+Usage: run-experiment.sh --tag <label> [options]
+
+Runs the standard autoresearch iteration:
+1. Run tests
+2. Run terminal-bench evaluation
+3. Compare the latest row in results.tsv to the previous row
+4. Print KEEP/REVERT decision
+5. Optionally reset the latest code commit while preserving results.tsv
+
+Options:
+  --tag LABEL             Required label written into results.tsv
+  --tasks-file FILE       Task list passed to evaluate.sh
+  --model MODEL           Model name passed to evaluate.sh
+  -k N                    Trials per task (default: 3)
+  --sleep N               Sleep between trials (default: 3)
+  --output FILE           results.tsv path (default: benchmark/autoresearch/results.tsv)
+  --no-local-tarballs     Use whatever is already installed in task images
+  --tarball-dir DIR       Directory for generated local tarballs
+  --tarball-port N        HTTP port for temporary local tarball server
+  --tarball-host HOST     Hostname containers should use for tarball server
+  --full-tests            Run full `bun test` instead of the targeted autoresearch gate
+  --skip-tests            Skip `bun test`
+  --revert-on-regress     If decision is REVERT, reset HEAD~1 and restore results.tsv
+  -h, --help              Show help
+EOF
+}
+
+while (($#)); do  # ${2?msg} aborts with msg when an option is the last word and has no value
+  case "$1" in
+    --tag) TAG="${2?--tag requires a value}"; shift 2 ;;
+    --tasks-file) TASKS_FILE="${2?--tasks-file requires a value}"; shift 2 ;;
+    --model) MODEL="${2?--model requires a value}"; shift 2 ;;
+    -k) K="${2?-k requires a value}"; shift 2 ;;
+    --sleep) SLEEP_BETWEEN="${2?--sleep requires a value}"; shift 2 ;;
+    --output) OUTPUT="${2?--output requires a value}"; shift 2 ;;
+    --no-local-tarballs) USE_LOCAL_TARBALLS=false; shift ;;
+    --tarball-dir) TARBALL_DIR="${2?--tarball-dir requires a value}"; shift 2 ;;
+    --tarball-port) TARBALL_PORT="${2?--tarball-port requires a value}"; shift 2 ;;
+    --tarball-host) TARBALL_HOST="${2?--tarball-host requires a value}"; shift 2 ;;
+    --full-tests) FULL_TESTS=true; shift ;;
+    --skip-tests) SKIP_TESTS=true; shift ;;
+    --revert-on-regress) REVERT_ON_REGRESS=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage; exit 1 ;;  # anything else is a usage error
+  esac
+done
+
+cleanup() {  # EXIT handler: stop the temporary tarball server and delete its log file
+  if [ -n "${TARBALL_SERVER_PID:-}" ]; then
+    kill "$TARBALL_SERVER_PID" >/dev/null 2>&1 || true  # tolerate a server that already exited
+    wait "$TARBALL_SERVER_PID" 2>/dev/null || true
+  fi
+  if [ -n "${TARBALL_SERVER_LOG:-}" ]; then
+    rm -f "$TARBALL_SERVER_LOG"
+  fi
+}
+
+trap cleanup EXIT  # runs on every exit path of the script
+
+ensure_harbor_registration() {  # symlink this repo's Harbor agent files into the installed harbor package
+  local agents_dir
+  agents_dir="$(python3 - <<'PY'
+import harbor
+from pathlib import Path
+print(Path(harbor.__path__[0]) / "agents" / "installed")  # agents/installed inside the harbor package
+PY
+)"
+
+  ln -sf "${REPO_ROOT}/benchmark/terminalbench/open_agent_sdk_harbor/agent.py" \
+    "${agents_dir}/open_agent_sdk.py"  # -f replaces an existing destination
+  ln -sf "${REPO_ROOT}/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2" \
+    "${agents_dir}/install-open-agent-sdk.sh.j2"
+
+  echo "Harbor agent registered from current repo:"
+  echo "  ${agents_dir}/open_agent_sdk.py"
+  echo "  ${agents_dir}/install-open-agent-sdk.sh.j2"
+  echo ""
+}
+
+if [ -z "$TAG" ]; then
+  echo "--tag is required" >&2
+  exit 1
+fi
+
+if ! [[ "$K" =~ ^[1-9][0-9]*$ ]]; then  # digits only, no leading zero, at least 1
+  echo "-k must be a positive integer, got: $K" >&2
+  exit 1
+fi
+
+mkdir -p "$(dirname "$OUTPUT")"
+if [ ! -f "$OUTPUT" ]; then  # seed a new results file with the header row
+  printf 'commit\tpass@k\tpass^k\tavg_trial\tany_pass\tall_pass\ttrial_pass\ttotal_trials\ttasks\tk\tdescription\n' > "$OUTPUT"
+fi
+OUTPUT_REL="$(python3 -c 'import os,sys; print(os.path.relpath(sys.argv[1], sys.argv[2]))' "$OUTPUT" "$REPO_ROOT")"  # used to recognise the results file in git status output
+
+if [ "$USE_LOCAL_TARBALLS" = true ]; then
+  if ! command -v python3 >/dev/null 2>&1; then
+    echo "python3 is required for local tarball mode" >&2
+    exit 1
+  fi
+  echo "=== Preparing local tarballs ==="
+  bash "${REPO_ROOT}/benchmark/terminalbench/scripts/pack-local-tarballs.sh" --output-dir "$TARBALL_DIR"
+  REQUESTED_PORT="$TARBALL_PORT"
+  TARBALL_PORT="$(python3 - "$REQUESTED_PORT" <<'PY'
+import socket
+import sys
+
+start = int(sys.argv[1])
+for port in range(start, start + 50):  # probe the requested port and the 49 after it
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        sock.bind(("0.0.0.0", port))
+    except OSError:  # bind failed, try the next port
+        sock.close()
+        continue
+    sock.close()
+    print(port)  # first port that could be bound
+    break
+else:
+    raise SystemExit("no free port found")
+PY
+)"
+  if [ "$TARBALL_PORT" != "$REQUESTED_PORT" ]; then
+    echo "Port $REQUESTED_PORT already in use; using $TARBALL_PORT instead."
+  fi
+  TARBALL_SERVER_LOG="$(mktemp)"
+  python3 -m http.server "$TARBALL_PORT" --bind 0.0.0.0 --directory "$TARBALL_DIR" \
+    >"$TARBALL_SERVER_LOG" 2>&1 &
+  TARBALL_SERVER_PID=$!  # remembered so the cleanup trap can stop the server
+  sleep 1  # give the server a moment before checking that it is still alive
+  if ! kill -0 "$TARBALL_SERVER_PID" >/dev/null 2>&1; then
+    echo "failed to start tarball server on port $TARBALL_PORT" >&2
+    cat "$TARBALL_SERVER_LOG" >&2 || true
+    exit 1
+  fi
+  export OAS_LOCAL_TARBALL_URL="http://${TARBALL_HOST}:${TARBALL_PORT}"  # exported for child processes
+  echo "Local tarball URL: $OAS_LOCAL_TARBALL_URL"
+  echo ""
+fi
+
+ensure_harbor_registration
+
+if [ "$SKIP_TESTS" != true ]; then
+  if [ "$FULL_TESTS" = true ]; then
+    TEST_CMD=("bun" "test")  # --full-tests: run the whole suite instead of the targeted gate
+  fi
+  echo "=== Running tests ==="
+  (  # subshell keeps the cd local to the test run
+    cd "$REPO_ROOT"
+    "${TEST_CMD[@]}"
+  )
+  echo ""
+fi
+
+echo "=== Running benchmark evaluation ==="  # the decision step below reads the last row of OUTPUT
+"${REPO_ROOT}/benchmark/autoresearch/evaluate.sh" \
+  --tasks-file "$TASKS_FILE" \
+  --model "$MODEL" \
+  -k "$K" \
+  --sleep "$SLEEP_BETWEEN" \
+  --tag "$TAG" \
+  --output "$OUTPUT"
+echo ""
+
+DECISION_JSON="$(
+  python3 - "$OUTPUT" <<'PY'
+# Compare the newest results row with the baseline row and emit a KEEP/REVERT
+# verdict as one JSON object on stdout.
+import csv
+import json
+import sys
+
+with open(sys.argv[1], newline="") as handle:
+    rows = list(csv.DictReader(handle, delimiter="\t"))
+
+if not rows:
+    raise SystemExit("results.tsv is empty after evaluation")
+
+cur = rows[-1]
+# Baseline: the most recent earlier row not marked [REVERTED]. A reverted row
+# measures code that was reset away, so using it as the baseline would let a
+# run that merely beats the failed experiment be kept.
+prev = next(
+    (row for row in reversed(rows[:-1]) if "[REVERTED]" not in (row.get("description") or "")),
+    None,
+)
+
+def metric(row, key):
+    """Return column `key` of `row` as a float."""
+    return float(row[key])
+
+result = {
+    "decision": "KEEP",
+    "reason": "first recorded run" if prev is None else "non-regression",
+    "current": cur,
+    "previous": prev,
+}
+
+if prev is not None:
+    cur_pass_at = metric(cur, "pass@k")
+    prev_pass_at = metric(prev, "pass@k")
+    cur_pass_pow = metric(cur, "pass^k")
+    prev_pass_pow = metric(prev, "pass^k")
+
+    # pass@k decides first; pass^k only matters when pass@k is unchanged.
+    if cur_pass_at > prev_pass_at:
+        result["reason"] = "pass@k improved"
+    elif cur_pass_at == prev_pass_at and cur_pass_pow > prev_pass_pow:
+        result["reason"] = "pass^k improved while pass@k held"
+    elif cur_pass_at < prev_pass_at:
+        result["decision"] = "REVERT"
+        result["reason"] = "pass@k regressed"
+    elif cur_pass_pow < prev_pass_pow:
+        result["decision"] = "REVERT"
+        result["reason"] = "pass^k regressed without pass@k gain"
+    else:
+        result["reason"] = "metrics held steady"
+
+print(json.dumps(result))
+PY
+)"
+
+DECISION="$(python3 -c 'import json,sys; print(json.loads(sys.argv[1])["decision"])' "$DECISION_JSON")"  # KEEP or REVERT
+REASON="$(python3 -c 'import json,sys; print(json.loads(sys.argv[1])["reason"])' "$DECISION_JSON")"
+CURRENT_DESC="$(python3 -c 'import json,sys; print(json.loads(sys.argv[1])["current"]["description"])' "$DECISION_JSON")"  # description column of the newest row
+
+echo "=== Decision ==="
+echo "Decision: $DECISION"
+echo "Reason:   $REASON"
+
+python3 - "$DECISION_JSON" <<'PY'
+import json
+import sys
+
+info = json.loads(sys.argv[1])
+cur = info["current"]
+prev = info.get("previous")  # null in the JSON when there was no row to compare against
+
+print("Current:")
+print(f"  commit={cur['commit']} tag={cur['description']} pass@k={cur['pass@k']} pass^k={cur['pass^k']} avg_trial={cur['avg_trial']}")
+if prev:
+    print("Previous:")
+    print(f"  commit={prev['commit']} tag={prev['description']} pass@k={prev['pass@k']} pass^k={prev['pass^k']} avg_trial={prev['avg_trial']}")
+PY
+
+if [ "$DECISION" = "REVERT" ] && [ "$REVERT_ON_REGRESS" = true ]; then
+  echo ""
+  echo "=== Reverting latest code commit ==="
+  TRACKED_STATUS="$(git -C "$REPO_ROOT" status --porcelain --untracked-files=no)"  # tracked changes only
+  if [ -n "$TRACKED_STATUS" ]; then
+    OTHER_TRACKED="$(printf '%s\n' "$TRACKED_STATUS" | grep -v " ${OUTPUT_REL}$" || true)"  # drop the results file; NOTE(review): OUTPUT_REL is used as a regex here
+    if [ -n "$OTHER_TRACKED" ]; then
+      echo "Skipping auto-revert because tracked files besides ${OUTPUT_REL} are modified:"
+      printf '%s\n' "$OTHER_TRACKED"
+      exit 0
+    fi
+  fi
+  RESULTS_BACKUP="$(mktemp)"
+  cp "$OUTPUT" "$RESULTS_BACKUP"  # keep the results table across the hard reset
+  python3 - "$RESULTS_BACKUP" "$CURRENT_DESC" <<'PY'
+import pathlib
+import sys
+
+path = pathlib.Path(sys.argv[1])
+tag = sys.argv[2]
+lines = path.read_text().splitlines()
+if len(lines) < 2:  # header only, nothing to mark
+    raise SystemExit(0)
+last = lines[-1].split("\t")
+if last[-1] == tag and "[REVERTED]" not in last[-1]:  # mark the newest row once
+    last[-1] = f"{last[-1]} [REVERTED]"
+    lines[-1] = "\t".join(last)
+path.write_text("\n".join(lines) + "\n")
+PY
+  git -C "$REPO_ROOT" reset --hard HEAD~1  # drops the newest commit and tracked working-tree changes
+  cp "$RESULTS_BACKUP" "$OUTPUT"  # put the marked results table back
+  rm -f "$RESULTS_BACKUP"
+  echo "Reverted HEAD~1 and restored $OUTPUT with [REVERTED] marker."
+fi

From 2a28cfe5f0ad23a280d3ae8a7c3080fd549299fd Mon Sep 17 00:00:00 2001
From: octane0411 <wdznb1@gmail.com>
Date: Tue, 24 Mar 2026 11:11:50 +0800
Subject: [PATCH 3/4] chore(benchmark): disable overnight image cleanup by
 default

---
 benchmark/terminalbench/README.md                           | 6 ++++--
 .../terminalbench/scripts/run-terminalbench-overnight.sh    | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/benchmark/terminalbench/README.md b/benchmark/terminalbench/README.md
index 1c8c29a..58a2fac 100644
--- a/benchmark/terminalbench/README.md
+++ b/benchmark/terminalbench/README.md
@@ -107,12 +107,14 @@ chmod +x benchmark/terminalbench/scripts/*.sh
 
 ./benchmark/terminalbench/scripts/run-terminalbench-overnight.sh \
   --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt \
-  --batch-size 2 \
-  --keep-images 1 \
+  --batch-size 0 \
   --task-repeats 1 \
   --agent-timeout-multiplier 0.6
 ```
 
+`run-terminalbench-overnight.sh` now defaults to `--batch-size 0`, so terminal-bench
+images are kept unless you explicitly opt into periodic cleanup.
+
 Image cleanup only (manual):
 
 ```bash
diff --git a/benchmark/terminalbench/scripts/run-terminalbench-overnight.sh b/benchmark/terminalbench/scripts/run-terminalbench-overnight.sh
index d434d82..a29dbaf 100755
--- a/benchmark/terminalbench/scripts/run-terminalbench-overnight.sh
+++ b/benchmark/terminalbench/scripts/run-terminalbench-overnight.sh
@@ -6,7 +6,7 @@ MODEL="MiniMax-M2.5"
 ENV_TYPE="docker"
 AGENT_IMPORT_PATH="harbor.agents.installed.open_agent_sdk:OpenAgentSDKAgent"
 TASKS_FILE=""
-BATCH_SIZE=5
+BATCH_SIZE=0
 KEEP_IMAGES=2
 AGENT_TIMEOUT_MULTIPLIER="0.6"
 TIMEOUT_MULTIPLIER=""
@@ -25,7 +25,7 @@ Options:
   --dataset NAME             Harbor dataset (default: terminal-bench@2.0)
   --model NAME               Model name (default: MiniMax-M2.5)
   --env NAME                 Harbor env type (default: docker)
-  --batch-size N             Cleanup every N tasks (default: 5)
+  --batch-size N             Cleanup every N tasks; 0 disables cleanup (default: 0)
   --keep-images N            Keep newest N terminal-bench images (default: 2)
   --task-repeats K           Harbor -k value per task (default: 1)
   --agent-timeout-multiplier X
@@ -38,7 +38,7 @@ Options:
 Example:
   ./benchmark/terminalbench/scripts/run-terminalbench-overnight.sh \
     --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt \
-    --batch-size 3 --keep-images 1 --task-repeats 1
+    --batch-size 0 --task-repeats 1
 EOF
 }
 

From 172587e5c7608c77a5b7d5a6969a6f46ec7fc963 Mon Sep 17 00:00:00 2001
From: octane0411 <wdznb1@gmail.com>
Date: Tue, 24 Mar 2026 17:44:59 +0800
Subject: [PATCH 4/4] feat(benchmark): prewarm terminalbench verifier
 environments

---
 benchmark/autoresearch/run-experiment.sh      |  13 ++
 benchmark/terminalbench/prewarm-images.sh     |  88 +++++++++--
 .../scripts/patch-task-verifiers.sh           | 147 ++++++++++++++++++
 3 files changed, 236 insertions(+), 12 deletions(-)
 create mode 100644 benchmark/terminalbench/scripts/patch-task-verifiers.sh

diff --git a/benchmark/autoresearch/run-experiment.sh b/benchmark/autoresearch/run-experiment.sh
index bd6f2a8..03f32aa 100755
--- a/benchmark/autoresearch/run-experiment.sh
+++ b/benchmark/autoresearch/run-experiment.sh
@@ -114,6 +114,18 @@ PY
   echo ""
 }
 
+patch_cached_verifiers() {  # best-effort: point cached verifiers at the pre-warmed env; never abort the run
+  local patcher="${REPO_ROOT}/benchmark/terminalbench/scripts/patch-task-verifiers.sh"
+  if [ ! -f "$patcher" ]; then
+    echo "WARN: verifier patcher not found: $patcher" >&2
+    return 0
+  fi
+  echo "=== Patching cached verifier scripts ==="
+  bash "$patcher" --tasks-file "$TASKS_FILE" ||
+    echo "WARN: verifier patching failed; continuing with unpatched verifiers" >&2
+  echo ""
+}
+
 if [ -z "$TAG" ]; then
   echo "--tag is required" >&2
   exit 1
@@ -176,6 +188,7 @@ PY
 fi
 
 ensure_harbor_registration
+patch_cached_verifiers
 
 if [ "$SKIP_TESTS" != true ]; then
   if [ "$FULL_TESTS" = true ]; then
diff --git a/benchmark/terminalbench/prewarm-images.sh b/benchmark/terminalbench/prewarm-images.sh
index d5cdecb..1be8386 100755
--- a/benchmark/terminalbench/prewarm-images.sh
+++ b/benchmark/terminalbench/prewarm-images.sh
@@ -24,6 +24,8 @@ FORCE=false
 RESTORE=false
 BACKUP_PREFIX="oas-original"
 PYPI_MIRROR="https://pypi.tuna.tsinghua.edu.cn/simple"
+PYPI_FALLBACK_INDEX="https://pypi.org/simple"
+UV_PREWARM_TIMEOUT="600"
 PACK_LOCAL_TARBALLS=false
 TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs"
 TARBALL_PORT="8765"
@@ -43,6 +45,10 @@ Options:
   --force              Force rebuild even if already pre-warmed
   --restore            Restore original images from backup
   --pypi-mirror URL    PyPI mirror for pytest install (default: tsinghua)
+  --pypi-fallback URL  Fallback PyPI index when the mirror misses packages
+                       (default: https://pypi.org/simple)
+  --uv-prewarm-timeout N
+                     Best-effort verifier prewarm timeout in seconds (default: 600)
   --pack-local-tarballs
                      Build repo-local SDK/CLI tarballs and serve them temporarily
   --tarball-dir DIR    Directory used for generated local tarballs
@@ -60,6 +66,8 @@ while (($#)); do
     --force) FORCE=true; shift ;;
     --restore) RESTORE=true; shift ;;
     --pypi-mirror) PYPI_MIRROR="${2:-}"; shift 2 ;;
+    --pypi-fallback) PYPI_FALLBACK_INDEX="${2:-}"; shift 2 ;;
+    --uv-prewarm-timeout) UV_PREWARM_TIMEOUT="${2:-}"; shift 2 ;;
     --pack-local-tarballs) PACK_LOCAL_TARBALLS=true; shift ;;
     --tarball-dir) TARBALL_DIR="${2:-}"; shift 2 ;;
     --tarball-port) TARBALL_PORT="${2:-}"; shift 2 ;;
@@ -205,19 +213,18 @@ while IFS= read -r image; do
   echo "[BUILD] $image"
   echo "  tasks: $tasks"
 
-  # Pull original if needed
-  if ! docker image inspect "$image" &>/dev/null; then
+  if docker image inspect "$backup" &>/dev/null; then
+    # Prefer the local backup to avoid unnecessary registry pulls on rebuild.
+    echo "  restoring original image from backup before rebuild"
+    docker tag "$backup" "$image"
+  elif ! docker image inspect "$image" &>/dev/null; then
     echo "  pulling $image ..."
     if ! docker pull "$image"; then
       echo "  FAIL: pull failed"
       FAILED=$((FAILED + 1))
       continue
     fi
-  fi
-
-  if docker image inspect "$backup" &>/dev/null; then
-    echo "  restoring original image from backup before rebuild"
-    docker tag "$backup" "$image"
+    docker tag "$image" "$backup"
   else
     # Backup original image on first pre-warm
     docker tag "$image" "$backup"
@@ -245,11 +252,68 @@ if ! command -v curl &>/dev/null; then
 fi
 curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh
 export PATH="\$HOME/.local/bin:\$PATH"
-UV_INDEX_URL="${PYPI_MIRROR}" UV_HTTP_TIMEOUT=300 uvx \\
-  -p 3.13 \\
-  -w pytest==8.4.1 \\
-  -w pytest-json-ctrf==0.3.5 \\
-  pytest --version
+
+install_verifier_packages() {  # install pytest and pytest-json-ctrf; status 0 on the first index that works
+  local python_bin="/opt/oas-verifier/bin/python"
+  local index=""  # the mirror is tried first, the fallback index only after the mirror fails
+  for index in "${PYPI_MIRROR}" "${PYPI_FALLBACK_INDEX}"; do
+    [ -z "\$index" ] && continue
+    echo "Installing verifier packages via \${index}"
+    if "\$python_bin" -m pip install \
+      --disable-pip-version-check \
+      --default-timeout 15 \
+      --retries 1 \
+      -i "\$index" \
+      pytest==8.4.1 \
+      pytest-json-ctrf==0.3.5; then
+      return 0
+    fi
+    echo "Verifier package install failed on \${index}, trying next index..."
+  done
+  return 1
+}
+
+build_verifier_env() {  # build /opt/oas-verifier from scratch; non-zero status as soon as a step fails
+  rm -rf /opt/oas-verifier
+  UV_HTTP_TIMEOUT=300 uv venv --python 3.13 /opt/oas-verifier || return 1
+  /opt/oas-verifier/bin/python -m ensurepip >/dev/null || return 1
+  install_verifier_packages || return 1
+  /opt/oas-verifier/bin/python -m pytest --version || return 1  # proves pytest is importable
+  chmod -R a+rX /opt/oas-verifier || true  # permissions are best-effort only
+}
+
+prewarm_verifier_env() {  # run build_verifier_env in the background under a watchdog timeout
+  local uv_pid=""
+  local watchdog_pid=""
+  local status=0
+
+  (
+    build_verifier_env
+  ) &
+  uv_pid=\$!
+
+  (  # watchdog: after the timeout send TERM to the build, then KILL three seconds later
+    sleep "${UV_PREWARM_TIMEOUT}"
+    if kill -0 "\$uv_pid" >/dev/null 2>&1; then
+      echo "WARN: uv/pytest prewarm timed out after ${UV_PREWARM_TIMEOUT}s; continuing without verifier cache"
+      kill -TERM "\$uv_pid" >/dev/null 2>&1 || true
+      sleep 3
+      kill -KILL "\$uv_pid" >/dev/null 2>&1 || true
+    fi
+  ) &
+  watchdog_pid=\$!
+
+  wait "\$uv_pid" || status=\$?  # record the build status without aborting here
+  kill "\$watchdog_pid" >/dev/null 2>&1 || true  # the watchdog is no longer needed
+  wait "\$watchdog_pid" 2>/dev/null || true
+  return "\$status"
+}
+
+if prewarm_verifier_env; then
+  echo "Verifier environment ready: /opt/oas-verifier"
+else  # remove any half-built env: patched verifiers use it whenever its python is executable
+  rm -rf /opt/oas-verifier || true; echo "WARN: uv/pytest prewarm failed; continuing without /opt/oas-verifier"
+fi
 echo "=== Pre-warm complete ==="
 SETUP_FOOTER
 
diff --git a/benchmark/terminalbench/scripts/patch-task-verifiers.sh b/benchmark/terminalbench/scripts/patch-task-verifiers.sh
new file mode 100644
index 0000000..8eee49c
--- /dev/null
+++ b/benchmark/terminalbench/scripts/patch-task-verifiers.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on command failure, unset variables and failed pipeline stages
+
+TASKS_FILE=""  # --tasks-file, required
+TASK_CACHE_ROOT="${HOME}/.cache/harbor/tasks"  # --task-cache-root
+DRY_RUN=false  # --dry-run
+
+usage() {  # print the command-line help text on stdout
+  cat <<'EOF'
+Usage: patch-task-verifiers.sh --tasks-file FILE [options]
+
+Patch cached Harbor verifier scripts so they prefer the pre-warmed
+/opt/oas-verifier environment when it exists.
+
+Options:
+  --tasks-file FILE       Task list file to patch
+  --task-cache-root DIR   Harbor task cache root (default: ~/.cache/harbor/tasks)
+  --dry-run               Print what would be patched without modifying files
+  -h, --help              Show help
+EOF
+}
+
+while (($#)); do  # ${2?msg} aborts with msg when an option is the last word and has no value
+  case "$1" in
+    --tasks-file) TASKS_FILE="${2?--tasks-file requires a value}"; shift 2 ;;
+    --task-cache-root) TASK_CACHE_ROOT="${2?--task-cache-root requires a value}"; shift 2 ;;
+    --dry-run) DRY_RUN=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage; exit 1 ;;  # anything else is a usage error
+  esac
+done
+
+if [ -z "$TASKS_FILE" ]; then  # also catches an explicitly empty --tasks-file value
+  echo "--tasks-file is required" >&2
+  exit 1
+fi
+
+python3 - "$TASKS_FILE" "$TASK_CACHE_ROOT" "$DRY_RUN" <<'PY'
+from pathlib import Path
+from typing import Optional
+import sys
+
+tasks_file = Path(sys.argv[1]).expanduser()
+task_cache_root = Path(sys.argv[2]).expanduser()
+dry_run = sys.argv[3].lower() == "true"  # the shell passes the string true or false
+
+marker = 'OAS_PREWARMED_VERIFIER_PYTHON="/opt/oas-verifier/bin/python"'  # its presence means a script is already patched
+pwd_check = 'if [ "$PWD" = "/" ]; then'  # guard line emitted inside the injected snippet
+
+if not tasks_file.is_file():
+    raise SystemExit(f"tasks file not found: {tasks_file}")
+
+if not task_cache_root.is_dir():
+    raise SystemExit(f"task cache root not found: {task_cache_root}")
+
+tasks: list[str] = []  # task names, one per non-blank, non-comment line of the tasks file
+for raw in tasks_file.read_text().splitlines():
+    line = raw.strip()
+    if not line or line.startswith("#"):
+        continue
+    tasks.append(line)
+
+patched = 0  # scripts rewritten, or that would be rewritten in dry-run mode
+skipped = 0  # scripts that already contain the marker
+warnings = 0  # tasks without a cached verifier, or scripts without a pytest call
+
+def find_pytest_args(lines: list[str]) -> Optional[str]:
+    """Return the arguments of the first pytest invocation, or None if there is none."""
+    # Fold backslash continuations so an invocation wrapped over several lines keeps every argument.
+    logical_lines = "\n".join(lines).replace("\\\n", " ").splitlines()
+    for stripped in map(str.strip, logical_lines):
+        if stripped.startswith("#") or "pytest " not in stripped:
+            continue
+        return stripped.split("pytest ", 1)[1].strip()
+    return None
+
+def insertion_index(lines: list[str]) -> int:
+    """Return the index of the first line that is real shell code.
+
+    Blank lines, comments (which covers the shebang) and `export`
+    statements form the script preamble; the snippet goes right after it.
+    If every line belongs to the preamble the result is len(lines).
+    """
+    for position, text in enumerate(lines):
+        body = text.strip()
+        if body and not body.startswith(("#", "export ")):
+            return position
+    return len(lines)
+
+for task in tasks:
+    matches = sorted(task_cache_root.glob(f"*/{task}/tests/test.sh"))  # any cache subdirectory holding this task
+    if not matches:
+        print(f"WARN: no cached verifier found for task '{task}'")
+        warnings += 1
+        continue
+
+    for path in matches:
+        original = path.read_text()
+        if marker in original:  # already patched, leave it alone
+            print(f"SKIP: already patched {path}")
+            skipped += 1
+            continue
+
+        lines = original.splitlines()
+        pytest_args = find_pytest_args(lines)
+        if pytest_args is None:
+            print(f"WARN: could not find pytest invocation in {path}")
+            warnings += 1
+            continue
+
+        insert_at = insertion_index(lines)
+        snippet_lines = [  # shell fast path: run pytest from the pre-warmed env, write the reward, exit
+            'OAS_PREWARMED_VERIFIER_PYTHON="/opt/oas-verifier/bin/python"',
+            'if [ -x "$OAS_PREWARMED_VERIFIER_PYTHON" ]; then',
+            f'  {pwd_check}',
+            '      echo "Error: No working directory set. Please set a WORKDIR in your Dockerfile before running this script."',
+            '      exit 1',
+            '  fi',
+            f'  "$OAS_PREWARMED_VERIFIER_PYTHON" -m pytest {pytest_args}',
+            '  pytest_status=$?',
+            '  if [ "$pytest_status" -eq 0 ]; then',
+            '    echo 1 > /logs/verifier/reward.txt',
+            '  else',
+            '    echo 0 > /logs/verifier/reward.txt',
+            '  fi',
+            '  exit 0',
+            'fi',
+            "",
+        ]
+        new_lines = lines[:insert_at] + snippet_lines + lines[insert_at:]  # original script stays below as the slow path
+        updated = "\n".join(new_lines) + "\n"
+
+        if dry_run:  # report only, write nothing
+            print(f"PATCH: {path}")
+            patched += 1
+            continue
+
+        backup = path.with_suffix(path.suffix + ".oas-orig")  # test.sh.oas-orig next to the script
+        if not backup.exists():  # never overwrite an existing backup
+            backup.write_text(original)
+        path.write_text(updated)
+        print(f"PATCHED: {path}")
+        patched += 1
+
+print("")
+print(f"Patched: {patched}  Skipped: {skipped}  Warnings: {warnings}")
+PY