PrimeIntellect-ai · sethkarten · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,6 @@
+# Copy to .env and fill in values. .env is gitignored.
+HF_TOKEN=hf_...
+PRIME_API_KEY=...
+PRIME_TEAM_ID=...
+# Optional: Prime internal registry image (team-scoped). Leave unset to use public DockerHub fallback.
+PRIME_TOOLCHAIN_IMAGE=
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,3 +27,16 @@ repos:
         language: system
         pass_filenames: false
         stages: [pre-push]
+      # lint-agent: for every Python file pre-commit passes in, send the lint-agent
+      # SKILL.md prompt plus the file contents to the `claude` CLI, echo the reply, and
+      # fail the hook when the reply contains "[HIGH]".
+      # The hook is a no-op (exit 0) when SKILL.md does not exist on this machine.
+      # "$SKILL" and "$f" are quoted inside the `cat` calls so that paths containing
+      # spaces or glob characters are read as a single file.
+      # NOTE(review): stderr of `claude` is discarded, so a failing CLI call yields empty
+      # output and the file passes -- confirm this best-effort behaviour is intended.
+      - id: lint-agent
+        name: lint-agent
+        entry: bash -c 'SKILL="$(git rev-parse --show-toplevel)/../../Planning/.agents/skills/lint-agent/SKILL.md"; [ -f "$SKILL" ] || exit 0; for f in "$@"; do [ -f "$f" ] || continue; OUT=$(claude --print "$(cat "$SKILL")" "$(cat "$f")" 2>/dev/null); echo "$OUT"; echo "$OUT" | grep -q "^.*\[HIGH\]" && exit 1; done; exit 0' --
+        language: system
+        types_or: [python, pyi]
+        pass_filenames: true
+      # oo-agent: for every Python file under environments/ that pre-commit passes in,
+      # send the oo-agent SKILL.md prompt plus the file contents to the `claude` CLI,
+      # echo the reply, and fail the hook when a reply line starts with "FAIL:".
+      # Set SKIP_OO_CHECK to any non-empty value to bypass the hook; it is also a no-op
+      # (exit 0) when SKILL.md does not exist on this machine.
+      # "$SKILL" and "$f" are quoted inside the `cat` calls so that paths containing
+      # spaces or glob characters are read as a single file.
+      # NOTE(review): stderr of `claude` is discarded, so a failing CLI call yields empty
+      # output and the file passes -- confirm this best-effort behaviour is intended.
+      - id: oo-agent
+        name: oo-agent
+        entry: bash -c '[ -n "$SKIP_OO_CHECK" ] && exit 0; SKILL="$(git rev-parse --show-toplevel)/../../Planning/.agents/skills/oo-agent/SKILL.md"; [ -f "$SKILL" ] || exit 0; for f in "$@"; do [ -f "$f" ] || continue; OUT=$(claude --print "$(cat "$SKILL")" "$(cat "$f")" 2>/dev/null); echo "$OUT"; echo "$OUT" | grep -q "^FAIL:" && exit 1; done; exit 0' --
+        language: system
+        types_or: [python, pyi]
+        files: ^environments/
+        pass_filenames: true
diff --git a/environments/programbench/README.md b/environments/programbench/README.md
@@ -0,0 +1,54 @@
+# programbench
+
+### Overview
+- **Environment ID**: `programbench`
+- **Short description**: Agent reconstructs compilable source code from an execute-only binary and its documentation, scored by hidden pytest tests.
+- **Tags**: reverse-engineering, multi-turn, sandbox, binary-analysis, eval
+
+### Datasets
+- **Primary dataset(s)**: `PrimeIntellect/programbench-processed` — 195 tasks (C, C++, Go, Rust) with README, binary on HF, and hidden pytest test archives
+- **Source links**: [ProgramBench paper](https://arxiv.org/abs/2503.13066)
+- **Split sizes**: 195 tasks (train split): 32 C, 11 C++, 46 Go, 106 Rust
+
+### Task
+- **Type**: multi-turn, tool use (mini-SWE-agent)
+- **Output format**: source files written to `/workspace/src/` + `compile.sh` that produces `/workspace/executable`
+- **Rubric overview**: `solved = n_tests_passed / n_tests_total` from hidden pytest suite; reward = solved (weight 1.0)
+
+### Quickstart
+
+```bash
+# Requires: HF_TOKEN (private dataset + test archives)
+# Model API key is handled by the verifiers runtime proxy.
+prime eval run programbench -m openai/gpt-4.1-mini -n 5 -r 1
+```
+
+Filter by language (or by difficulty via `filter_difficulty`) and cap the number of tasks:
+
+```bash
+prime eval run programbench -m openai/gpt-4.1 -n 20 -r 1 \
+  -a '{"filter_language": "go", "max_tasks": 10}'
+```
+
+### Environment Arguments
+
+| Arg | Type | Default | Description |
+| --- | ---- | ------- | ----------- |
+| `filter_language` | str | `null` | Restrict to `"c"`, `"cpp"`, `"go"`, or `"rust"` |
+| `filter_difficulty` | str | `null` | Restrict to a difficulty tier |
+| `max_tasks` | int | `null` | Cap number of tasks loaded |
+| `hide_tests_from_agent` | bool | `true` | Keep test archive hidden until scoring |
+| `dataset_name` | str | `PrimeIntellect/programbench-processed` | HF dataset ID |
+
+### Metrics
+
+| Metric | Meaning |
+| ------ | ------- |
+| `reward` / `solved` | Fraction of hidden pytest tests passed (0–1) — RL training signal |
+| `resolved_binary` | 1 if all tests pass, 0 otherwise — primary paper metric (% Resolved) |
+| `compile_success` | Whether `compile.sh` produced an executable |
+| `compile_exit_code` | Exit code of the compile step |
+| `n_tests_passed` | Raw count of passing tests |
+| `n_tests_total` | Total tests in the hidden suite |
+| `pytest_log` | Last 4KB of pytest output |
+| `eval_error` | Set to `binary_wrap_detected` if submitted executable matches reference binary hash |
diff --git a/environments/programbench/data/go_subset.jsonl b/environments/programbench/data/go_subset.jsonl
diff --git a/environments/programbench/docker/Dockerfile b/environments/programbench/docker/Dockerfile
@@ -0,0 +1,64 @@
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    CARGO_HOME=/root/.cargo \
+    RUSTUP_HOME=/root/.rustup \
+    GOPATH=/root/go \
+    GOROOT=/usr/local/go \
+    GO_VERSION=1.26.3 \
+    PATH="/root/.cargo/bin:/usr/local/go/bin:/root/go/bin:${PATH}"
+
+# System packages: C/C++ toolchain, analysis tools, Python, general utilities
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        gcc \
+        g++ \
+        clang \
+        cmake \
+        make \
+        binutils \
+        file \
+        python3 \
+        python3-pip \
+        python3-dev \
+        tmux \
+        git \
+        curl \
+        wget \
+        tar \
+        bash \
+        ca-certificates \
+        pkg-config \
+        libssl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Python tooling
+RUN pip3 install --no-cache-dir \
+    pytest \
+    pytest-xdist \
+    junitparser
+
+# Go toolchain (release pinned by GO_VERSION in the ENV block above)
+RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" -o /tmp/go.tar.gz && \
+    tar -C /usr/local -xzf /tmp/go.tar.gz && \
+    rm /tmp/go.tar.gz
+
+# Rust via rustup (stable, no default toolchain prompt)
+RUN curl -fsSL https://sh.rustup.rs -o /tmp/rustup-init.sh && \
+    sh /tmp/rustup-init.sh -y --default-toolchain stable --profile minimal --no-modify-path && \
+    rm /tmp/rustup-init.sh && \
+    /root/.cargo/bin/rustup component add rustfmt clippy
+
+# Pre-warm cargo registry with common ProgramBench dependencies.
+# Building the debug and release profiles forces every dependency to be downloaded into ~/.cargo/registry.
+# The /tmp/warmup tree (including its target/ build artifacts) is deleted afterwards; only ~/.cargo/registry persists.
+COPY warmup/ /tmp/warmup/
+RUN cd /tmp/warmup && \
+    /root/.cargo/bin/cargo build 2>&1 && \
+    /root/.cargo/bin/cargo build --release 2>&1 && \
+    cd / && rm -rf /tmp/warmup
+
+# Workspace directory used by sandbox rollouts
+RUN mkdir -p /workspace
+
+WORKDIR /workspace
diff --git a/environments/programbench/docker/README.md b/environments/programbench/docker/README.md
@@ -0,0 +1,46 @@
+# programbench-toolchain
+
+Docker image providing the multi-language toolchain for ProgramBench evaluation rollouts in prime-sandboxes.
+
+## Contents
+
+| Component | Version |
+|-----------|---------|
+| Base OS | Ubuntu 22.04 |
+| Rust | stable (via rustup) |
+| Go | 1.26.3 (`GO_VERSION` in the Dockerfile) |
+| C/C++ | gcc, g++, clang (Ubuntu 22.04 defaults, ~11/14) |
+| Build tools | cmake, make |
+| Analysis | binutils (strings, nm, objdump), file |
+| Python | python3 + pip, pytest, pytest-xdist, junitparser |
+| General | git, curl, wget, tar, bash |
+
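+The cargo registry is pre-warmed with: `clap`, `serde`, `serde_json`, `anyhow`, `tokio`, `regex`, `thiserror`. The warmup project is built in both debug and release profiles, but only the downloaded crate sources under `~/.cargo/registry` are kept in the image; the compiled `target/` directory is deleted together with the warmup tree. This avoids 500 MB–2 GB of network downloads per rollout.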
+
+## Build and push
+
+```bash
+# One-time setup (only needed once per Docker host)
+docker login
+docker buildx create --use
+
+# Build and push
+./build.sh
+
+# Build with a specific tag
+TAG=v1.0 ./build.sh
+```
+
+## Expected image size
+
+| Layer | Approx. size |
+|-------|-------------|
+| Ubuntu 22.04 base + apt packages | ~350 MB |
+| Python + pytest deps | ~50 MB |
+| Go 1.26 toolchain | ~500 MB |
+| Rust toolchain (rustup + stable) | ~800 MB |
+| Pre-warmed cargo registry | ~700 MB |
+| **Total (compressed)** | **~2–2.5 GB** |
+| **Total (uncompressed on disk)** | **~4–5 GB** |
+
+The pre-warmed cargo layer adds ~700 MB compressed but eliminates multi-gigabyte downloads at eval time, which is the right trade-off for a long-lived sandbox image.
diff --git a/environments/programbench/docker/build.sh b/environments/programbench/docker/build.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Build and push the ProgramBench toolchain image.
+# Requires: docker login (to push to Docker Hub or internal registry)
+#
+# Usage:
+#   ./build.sh                     # build + push latest
+#   TAG=v1.2 ./build.sh            # build + push with custom tag
+#
+# Note: buildx with --platform linux/amd64 requires Docker Buildx and QEMU
+# if you are building from an ARM host (e.g. Apple Silicon Mac).
+# Run once to set up: docker buildx create --use
+
+set -euo pipefail
+
+IMAGE="primeintellect/programbench-toolchain"
+TAG="${TAG:-latest}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Building ${IMAGE}:${TAG} for linux/amd64 ..."
+
+docker buildx build \
+    --platform linux/amd64 \
+    --tag "${IMAGE}:${TAG}" \
+    --push \
+    "${SCRIPT_DIR}"
+
+echo "Pushed ${IMAGE}:${TAG}"
diff --git a/environments/programbench/docker/warmup/Cargo.toml b/environments/programbench/docker/warmup/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "warmup"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+clap = { version = "4", features = ["derive"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+anyhow = "1"
+tokio = { version = "1", features = ["full"] }
+regex = "1"
+thiserror = "1"
+
+[[bin]]
+name = "warmup"
+path = "src/main.rs"
diff --git a/environments/programbench/docker/warmup/src/main.rs b/environments/programbench/docker/warmup/src/main.rs
@@ -0,0 +1,35 @@
+use anyhow::Result;
+use clap::Parser;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use thiserror::Error;
+
+#[derive(Parser)] // clap derives the argument parser that main invokes through Cli::parse()
+struct Cli { // command-line options of the warmup binary (plain `//` comments: `///` would alter clap's help text)
+    #[arg(short, long, default_value = "world")] // optional short/long flag; "world" is used when it is omitted
+    name: String, // name inserted into the greeting; main validates it against ^\w+$
+}
+
+#[derive(Debug, Serialize, Deserialize)] // Serialize is what allows main to pass a Greeting to json!
+struct Greeting { // payload that main prints as JSON on stdout
+    message: String, // the "Hello, <name>!" text built in main
+}
+
+#[derive(Debug, Error)] // thiserror derives the error impls from the #[error] attribute below
+enum AppError { // failures the warmup binary can report
+    #[error("invalid name: {0}")] // {0} is filled with the rejected name
+    InvalidName(String), // returned by main when the name does not match ^\w+$
+}
+
+// Entry point: parse the command line, accept the name only when it consists
+// purely of word characters, then print the greeting as JSON on stdout.
+#[tokio::main]
+async fn main() -> Result<()> {
+    let args = Cli::parse();
+    let word_only = Regex::new(r"^\w+$")?;
+
+    if word_only.is_match(&args.name) {
+        let payload = Greeting {
+            message: format!("Hello, {}!", args.name),
+        };
+        println!("{}", json!(payload));
+        Ok(())
+    } else {
+        // `.into()` converts the typed error into the anyhow error returned by main.
+        Err(AppError::InvalidName(args.name).into())
+    }
+}