3 changes: 2 additions & 1 deletion environments/README.md
@@ -60,6 +60,7 @@ This folder contains installable example environments that showcase common usage
- **bfcl_v3**: BFCL v3 function-calling eval using task-local dynamic tool schemas and v1 rewards.
- **dspy_flights**: Sandboxed DSPy flight-support `program.fn` entrypoint installed from its package `pyproject.toml` and configured against the v1 interception endpoint.
- **hello_group_reward_v1**: Deterministic v1 reference for group updates, metrics, rewards, advantages, and cleanup.
- **swe_bench_verified**: SWE-bench Verified patch-generation taskset with deterministic gold-patch reward and metadata for downstream execution harnesses.
- **tau2_bench_v1**: `tau2-bench-v1` τ²-bench taskset/user/tool pattern on the v1 harness runtime.

### Composition
@@ -94,7 +95,7 @@ This folder contains installable example environments that showcase common usage
- **CLI agent sandboxes**: `opencode_harbor`, `terminus_harbor`, `hello_mcp_harbor`
- **MCP integration**: `mcp_search_env`, `hello_mcp_harbor`
- **RLM (recursive LLM)**: `rlm_secrets`
- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `dspy_flights`, and `tau2-bench-v1`.
- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `swe_bench_verified`, `dspy_flights`, and `tau2-bench-v1`.
- `opencode_harbor` uses the packaged `vf.HarborTaskset` + `vf.OpenCode` boundary. These reusable implementations live under `verifiers.v1.packages` and are re-exported from `verifiers.v1`.
- **Environment and rubric composition**: `math_group`, `math_python`, `wiki_search`
- **Procedural datasets**: `reasoning_gym_env`
28 changes: 28 additions & 0 deletions environments/swe_bench_verified/README.md
@@ -0,0 +1,28 @@
# swe-bench-verified

Patch-generation environment for `princeton-nlp/SWE-bench_Verified`.

```python
from swe_bench_verified import load_environment

env = load_environment()
```
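
Filtering and reward shaping are keyword arguments on the same loader. For
example (all parameter names match the `load_environment` signature in
`swe_bench_verified.py` below):

```python
env = load_environment(
    repos=["astropy/astropy"],  # keep only instances from one repository
    eval_limit=50,              # cap the eval split at 50 examples
    similarity_weight=0.25,     # add a soft SequenceMatcher reward term
)
```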

The taskset loads the 500-example SWE-bench Verified split from Hugging Face,
formats each instance as a repository repair prompt, and asks the model to
return a unified diff inside `<patch>...</patch>` tags.
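
The module's `extract_patch` helper tolerates untagged output: it returns the
tagged body when present and otherwise falls back to the full completion text.
A small illustration:

```python
from swe_bench_verified import extract_patch

completion = """I traced the bug to the separability matrix logic.
<patch>
diff --git a/astropy/modeling/separable.py b/astropy/modeling/separable.py
--- a/astropy/modeling/separable.py
+++ b/astropy/modeling/separable.py
</patch>"""

print(extract_patch(completion))  # only the diff body; tags and chatter stripped
```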

The default reward is intentionally deterministic and local: it compares the
normalized submitted patch to the gold patch included in the dataset. This makes
the environment useful for SFT/RL sanity checks and reward-model experiments
without requiring per-instance Docker images. It is not a replacement for the
official SWE-bench execution harness; `test_patch`, `FAIL_TO_PASS`, and
`PASS_TO_PASS` are preserved in `task["info"]` so downstream harnesses can run
execution-based validation when available.
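
For instance, a hypothetical downstream validator could gather everything it
needs from `task["info"]`. The sketch below assumes, as in upstream SWE-bench
releases, that `FAIL_TO_PASS`/`PASS_TO_PASS` are JSON-encoded lists:

```python
import json

def execution_inputs(task):  # hypothetical helper, not part of this module
    info = task["info"]
    return {
        "instance_id": info["instance_id"],
        "base_commit": info["base_commit"],
        "test_patch": info["test_patch"],
        "fail_to_pass": json.loads(info["FAIL_TO_PASS"]),  # assumed JSON string
        "pass_to_pass": json.loads(info["PASS_TO_PASS"]),  # assumed JSON string
    }
```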

For handoff into the official harness, `official_submission(task, patch)` returns
the expected JSONL row shape:

```json
{"instance_id": "astropy__astropy-12907", "model_patch": "diff --git ..."}
```
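
A minimal sketch of assembling a predictions file from finished rollouts; the
`rollouts` pairing of tasks with raw completions is hypothetical, while
`extract_patch` and `official_submission` come from this module:

```python
import json

from swe_bench_verified import extract_patch, official_submission

# hypothetical: (task, completion) pairs gathered from an eval run
with open("predictions.jsonl", "w") as f:
    for task, completion in rollouts:
        row = official_submission(task, extract_patch(completion))
        f.write(json.dumps(row) + "\n")
```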
28 changes: 28 additions & 0 deletions environments/swe_bench_verified/pyproject.toml
@@ -0,0 +1,28 @@
[project]
name = "swe-bench-verified"
version = "0.1.0"
description = "SWE-bench Verified patch-generation environment"
license = "Apache-2.0"
tags = ["swe", "coding", "patch", "swe-bench"]
requires-python = ">=3.11"
dependencies = [
"datasets",
"verifiers",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["swe_bench_verified.py", "README.md", "pyproject.toml"]

[project.entry-points."verifiers.environments"]
swe-bench-verified = "swe_bench_verified:load_environment"

[tool.verifiers.eval]
num_examples = 10
rollouts_per_example = 1

[tool.uv.sources]
verifiers = { path = "../..", editable = true }
268 changes: 268 additions & 0 deletions environments/swe_bench_verified/swe_bench_verified.py
@@ -0,0 +1,268 @@
from __future__ import annotations

import re
from collections.abc import Mapping
from difflib import SequenceMatcher
from typing import Any

from datasets import Dataset, load_dataset

import verifiers.v1 as vf

DEFAULT_DATASET_NAME = "princeton-nlp/SWE-bench_Verified"
DEFAULT_SPLIT = "test"
TASKSET_ID = "swe-bench/verified"

SYSTEM_PROMPT = """\
You are repairing a real GitHub repository issue from SWE-bench Verified.
Return only a unified diff that applies to the repository at the specified base
commit. Wrap the diff in <patch>...</patch> tags."""


def format_prompt(row: Mapping[str, Any]) -> str:
hints = str(row.get("hints_text") or "").strip()
hints_block = f"\n\nHints:\n{hints}" if hints else ""
return (
f"Repository: {row['repo']}\n"
f"Instance: {row['instance_id']}\n"
f"Base commit: {row['base_commit']}\n"
f"Difficulty: {row.get('difficulty') or 'unknown'}\n\n"
"Problem statement:\n"
f"{str(row['problem_statement']).strip()}"
f"{hints_block}\n\n"
"Return the minimal source-code patch as a unified diff."
)


def build_record(row: Mapping[str, Any]) -> dict[str, Any]:
info_keys = (
"repo",
"instance_id",
"base_commit",
"test_patch",
"FAIL_TO_PASS",
"PASS_TO_PASS",
"environment_setup_commit",
"difficulty",
"created_at",
"version",
)
info = {key: row.get(key) for key in info_keys if key in row}
    prompt = format_prompt(row)
    return {
        "task_id": row["instance_id"],
        "prompt": [{"role": "user", "content": prompt}],
        "question": prompt,
        "answer": row["patch"],
        "info": info,
    }


def load_rows(
dataset_name: str,
split: str,
limit: int | None,
repos: list[str] | None,
difficulties: list[str] | None,
keep_in_memory: bool,
) -> Dataset:
dataset = load_dataset(dataset_name, split=split, keep_in_memory=keep_in_memory)
if repos:
allowed_repos = frozenset(repos)
dataset = dataset.filter(lambda row: row["repo"] in allowed_repos)
if difficulties:
allowed_difficulties = frozenset(difficulties)
dataset = dataset.filter(
lambda row: str(row.get("difficulty") or "") in allowed_difficulties
)
if limit is not None and limit >= 0:
dataset = dataset.select(range(min(limit, len(dataset))))
return dataset.map(build_record, remove_columns=dataset.column_names)


def extract_patch(completion: object) -> str:
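    """Return the diff inside <patch>...</patch> tags, falling back to the full text."""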
text = completion_to_text(completion)
match = re.search(r"<patch>\s*(.*?)\s*</patch>", text, flags=re.DOTALL)
return match.group(1).strip() if match else text.strip()


def completion_to_text(completion: object) -> str:
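    """Flatten a completion (string, chat-message list, or None) into plain text."""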
if isinstance(completion, str):
return completion
if isinstance(completion, list):
parts: list[str] = []
for item in completion:
if isinstance(item, Mapping):
content = item.get("content")
if isinstance(content, str):
parts.append(content)
elif item is not None:
parts.append(str(item))
return "\n".join(parts)
if completion is None:
return ""
return str(completion)


def normalize_patch(patch: str) -> str:
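    """Canonicalize a diff: LF endings, no trailing whitespace, no git index lines."""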
lines = []
for raw_line in patch.replace("\r\n", "\n").splitlines():
line = raw_line.rstrip()
if line.startswith("index "):
continue
lines.append(line)
while lines and not lines[0]:
lines.pop(0)
while lines and not lines[-1]:
lines.pop()
return "\n".join(lines) + ("\n" if lines else "")


def patch_file_paths(patch: str) -> set[str]:
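    """Collect changed file paths from "diff --git" and "+++" headers."""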
paths: set[str] = set()
for line in patch.replace("\r\n", "\n").splitlines():
if line.startswith("diff --git "):
parts = line.split()
if len(parts) >= 4:
paths.add(strip_diff_prefix(parts[3]))
elif line.startswith("+++ ") and not line.startswith("+++ /dev/null"):
paths.add(strip_diff_prefix(line[4:].strip()))
return {path for path in paths if path}


def strip_diff_prefix(path: str) -> str:
if path.startswith("a/") or path.startswith("b/"):
return path[2:]
return path


def official_submission(task: Mapping[str, Any], patch: str) -> dict[str, str]:
"""Return the JSONL row shape expected by the official SWE-bench harness."""
return {
"instance_id": str(task["task_id"]),
"model_patch": normalize_patch(patch),
}


async def exact_patch(task: vf.Task, state: vf.State) -> float:
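    """1.0 when the normalized submitted patch exactly matches the gold patch."""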
expected = normalize_patch(str(task["answer"]))
actual = normalize_patch(extract_patch(state.get("completion")))
return 1.0 if actual == expected else 0.0


async def patch_similarity(task: vf.Task, state: vf.State) -> float:
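    """SequenceMatcher ratio between the normalized submitted and gold patches."""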
expected = normalize_patch(str(task["answer"]))
actual = normalize_patch(extract_patch(state.get("completion")))
if not expected or not actual:
return 0.0
return SequenceMatcher(None, actual, expected).ratio()


async def patch_line_count(task: vf.Task, state: vf.State) -> float:
patch = normalize_patch(extract_patch(state.get("completion")))
return float(len([line for line in patch.splitlines() if line]))


async def gold_patch_line_count(task: vf.Task, state: vf.State) -> float:
patch = normalize_patch(str(task["answer"]))
return float(len([line for line in patch.splitlines() if line]))


async def changed_file_overlap(task: vf.Task, state: vf.State) -> float:
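    """Jaccard overlap between the gold and submitted changed-file sets."""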
expected = patch_file_paths(str(task["answer"]))
actual = patch_file_paths(extract_patch(state.get("completion")))
if not expected or not actual:
return 0.0
return len(expected & actual) / len(expected | actual)


def load_taskset(
dataset_name: str = DEFAULT_DATASET_NAME,
split: str = DEFAULT_SPLIT,
eval_split: str | None = None,
train_limit: int | None = None,
eval_limit: int | None = None,
repos: list[str] | None = None,
difficulties: list[str] | None = None,
keep_in_memory: bool = True,
system_prompt: str = SYSTEM_PROMPT,
exact_weight: float = 1.0,
similarity_weight: float = 0.0,
config: vf.TasksetConfig | None = None,
) -> vf.Taskset:
def build_train() -> Dataset:
return load_rows(
dataset_name,
split,
train_limit,
repos,
difficulties,
keep_in_memory,
)

def build_eval() -> Dataset:
return load_rows(
dataset_name,
eval_split or split,
eval_limit,
repos,
difficulties,
keep_in_memory,
)

rewards = []
metrics = [
patch_similarity,
changed_file_overlap,
patch_line_count,
gold_patch_line_count,
]
if exact_weight > 0:
rewards.append(vf.reward(weight=exact_weight)(exact_patch))
else:
metrics.insert(0, exact_patch)
if similarity_weight > 0:
rewards.append(vf.reward(weight=similarity_weight)(patch_similarity))

return vf.Taskset(
source=build_train,
eval_source=build_eval,
taskset_id=TASKSET_ID,
system_prompt=system_prompt,
rewards=rewards,
metrics=metrics,
config=config,
)


def load_environment(
dataset_name: str = DEFAULT_DATASET_NAME,
split: str = DEFAULT_SPLIT,
eval_split: str | None = None,
train_limit: int | None = None,
eval_limit: int | None = None,
repos: list[str] | None = None,
difficulties: list[str] | None = None,
keep_in_memory: bool = True,
system_prompt: str = SYSTEM_PROMPT,
exact_weight: float = 1.0,
similarity_weight: float = 0.0,
config: vf.EnvConfig | None = None,
) -> vf.Env:
config = config or vf.EnvConfig()
return vf.Env(
taskset=load_taskset(
dataset_name=dataset_name,
split=split,
eval_split=eval_split,
train_limit=train_limit,
eval_limit=eval_limit,
repos=repos,
difficulties=difficulties,
keep_in_memory=keep_in_memory,
system_prompt=system_prompt,
exact_weight=exact_weight,
similarity_weight=similarity_weight,
config=config.taskset,
),
harness=vf.Harness(config=config.harness),
)