3 changes: 2 additions & 1 deletion environments/README.md
@@ -60,6 +60,7 @@ This folder contains installable example environments that showcase common usage
- **bfcl_v3**: BFCL v3 function-calling eval using task-local dynamic tool schemas and v1 rewards.
- **dspy_flights**: Sandboxed DSPy flight-support `program.fn` entrypoint installed from its package `pyproject.toml` and configured against the v1 interception endpoint.
- **hello_group_reward_v1**: Deterministic v1 reference for group updates, metrics, rewards, advantages, and cleanup.
- **swe_bench_verified**: SWE-bench Verified patch-generation taskset with deterministic gold-patch reward and metadata for downstream execution harnesses.
- **tau2_bench_v1**: `tau2-bench-v1` τ²-bench taskset/user/tool pattern on the v1 harness runtime.

### Composition
@@ -94,7 +95,7 @@ This folder contains installable example environments that showcase common usage
- **CLI agent sandboxes**: `opencode_harbor`, `terminus_harbor`, `hello_mcp_harbor`
- **MCP integration**: `mcp_search_env`, `hello_mcp_harbor`
- **RLM (recursive LLM)**: `rlm_secrets`
- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `dspy_flights`, and `tau2-bench-v1`.
- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `swe_bench_verified`, `dspy_flights`, and `tau2-bench-v1`.
- `opencode_harbor` uses the packaged `vf.HarborTaskset` + `vf.OpenCode` boundary. These reusable implementations live under `verifiers.v1.packages` and are re-exported from `verifiers.v1`.
- **Environment and rubric composition**: `math_group`, `math_python`, `wiki_search`
- **Procedural datasets**: `reasoning_gym_env`
28 changes: 28 additions & 0 deletions environments/swe_bench_verified/README.md
@@ -0,0 +1,28 @@
# swe-bench-verified

Patch-generation environment for `princeton-nlp/SWE-bench_Verified`.

```python
from swe_bench_verified import load_environment

env = load_environment()
```
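
Filtering and reward shaping are keyword arguments on the same loader. For
example (all parameter names match the `load_environment` signature in
`swe_bench_verified.py` below):

```python
env = load_environment(
    repos=["astropy/astropy"],  # keep only instances from one repository
    eval_limit=50,              # cap the eval split at 50 examples
    similarity_weight=0.25,     # add a soft SequenceMatcher reward term
)
```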

The taskset loads the 500-example SWE-bench Verified split from Hugging Face,
formats each instance as a repository repair prompt, and asks the model to
return a unified diff inside `<patch>...</patch>` tags.
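
The module's `extract_patch` helper tolerates untagged output: it returns the
tagged body when present and otherwise falls back to the full completion text.
A small illustration:

```python
from swe_bench_verified import extract_patch

completion = """I traced the bug to the separability matrix logic.
<patch>
diff --git a/astropy/modeling/separable.py b/astropy/modeling/separable.py
--- a/astropy/modeling/separable.py
+++ b/astropy/modeling/separable.py
</patch>"""

print(extract_patch(completion))  # only the diff body; tags and chatter stripped
```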

The default reward is intentionally deterministic and local: it compares the
normalized submitted patch to the gold patch included in the dataset. This makes
the environment useful for SFT/RL sanity checks and reward-model experiments
without requiring per-instance Docker images. It is not a replacement for the
official SWE-bench execution harness; `test_patch`, `FAIL_TO_PASS`, and
`PASS_TO_PASS` are preserved in `task["info"]` so downstream harnesses can run
execution-based validation when available.
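
For instance, a hypothetical downstream validator could gather everything it
needs from `task["info"]`. The sketch below assumes, as in upstream SWE-bench
releases, that `FAIL_TO_PASS`/`PASS_TO_PASS` are JSON-encoded lists:

```python
import json

def execution_inputs(task):  # hypothetical helper, not part of this module
    info = task["info"]
    return {
        "instance_id": info["instance_id"],
        "base_commit": info["base_commit"],
        "test_patch": info["test_patch"],
        "fail_to_pass": json.loads(info["FAIL_TO_PASS"]),  # assumed JSON string
        "pass_to_pass": json.loads(info["PASS_TO_PASS"]),  # assumed JSON string
    }
```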

For handoff into the official harness, `official_submission(task, patch)` returns
the expected JSONL row shape:

```json
{"instance_id": "astropy__astropy-12907", "model_patch": "diff --git ..."}
```
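
A minimal sketch of assembling a predictions file from finished rollouts; the
`rollouts` pairing of tasks with raw completions is hypothetical, while
`extract_patch` and `official_submission` come from this module:

```python
import json

from swe_bench_verified import extract_patch, official_submission

# hypothetical: (task, completion) pairs gathered from an eval run
with open("predictions.jsonl", "w") as f:
    for task, completion in rollouts:
        row = official_submission(task, extract_patch(completion))
        f.write(json.dumps(row) + "\n")
```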
28 changes: 28 additions & 0 deletions environments/swe_bench_verified/pyproject.toml
@@ -0,0 +1,28 @@
[project]
name = "swe-bench-verified"
version = "0.1.0"
description = "SWE-bench Verified patch-generation environment"
license = "Apache-2.0"
tags = ["swe", "coding", "patch", "swe-bench"]
requires-python = ">=3.11"
dependencies = [
"datasets",
"verifiers",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["swe_bench_verified.py", "README.md", "pyproject.toml"]

[project.entry-points."verifiers.environments"]
swe-bench-verified = "swe_bench_verified:load_environment"

[tool.verifiers.eval]
num_examples = 10
rollouts_per_example = 1

[tool.uv.sources]
verifiers = { path = "../..", editable = true }
268 changes: 268 additions & 0 deletions environments/swe_bench_verified/swe_bench_verified.py
@@ -0,0 +1,268 @@
from __future__ import annotations

import re
from collections.abc import Mapping
from difflib import SequenceMatcher
from typing import Any

from datasets import Dataset, load_dataset

import verifiers.v1 as vf

DEFAULT_DATASET_NAME = "princeton-nlp/SWE-bench_Verified"
DEFAULT_SPLIT = "test"
TASKSET_ID = "swe-bench/verified"

SYSTEM_PROMPT = """\
You are repairing a real GitHub repository issue from SWE-bench Verified.
Return only a unified diff that applies to the repository at the specified base
commit. Wrap the diff in <patch>...</patch> tags."""


def format_prompt(row: Mapping[str, Any]) -> str:
hints = str(row.get("hints_text") or "").strip()
hints_block = f"\n\nHints:\n{hints}" if hints else ""
return (
f"Repository: {row['repo']}\n"
f"Instance: {row['instance_id']}\n"
f"Base commit: {row['base_commit']}\n"
f"Difficulty: {row.get('difficulty') or 'unknown'}\n\n"
"Problem statement:\n"
f"{str(row['problem_statement']).strip()}"
f"{hints_block}\n\n"
"Return the minimal source-code patch as a unified diff."
)


def build_record(row: Mapping[str, Any]) -> dict[str, Any]:
info_keys = (
"repo",
"instance_id",
"base_commit",
"test_patch",
"FAIL_TO_PASS",
"PASS_TO_PASS",
"environment_setup_commit",
"difficulty",
"created_at",
"version",
)
info = {key: row.get(key) for key in info_keys if key in row}
    prompt = format_prompt(row)
    return {
        "task_id": row["instance_id"],
        "prompt": [{"role": "user", "content": prompt}],
        "question": prompt,
        "answer": row["patch"],
        "info": info,
    }


def load_rows(
dataset_name: str,
split: str,
limit: int | None,
repos: list[str] | None,
difficulties: list[str] | None,
keep_in_memory: bool,
) -> Dataset:
dataset = load_dataset(dataset_name, split=split, keep_in_memory=keep_in_memory)
if repos:
allowed_repos = frozenset(repos)
dataset = dataset.filter(lambda row: row["repo"] in allowed_repos)
if difficulties:
allowed_difficulties = frozenset(difficulties)
dataset = dataset.filter(
lambda row: str(row.get("difficulty") or "") in allowed_difficulties
)
if limit is not None and limit >= 0:
dataset = dataset.select(range(min(limit, len(dataset))))
return dataset.map(build_record, remove_columns=dataset.column_names)


def extract_patch(completion: object) -> str:
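    """Return the diff inside <patch>...</patch> tags, falling back to the full text."""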
text = completion_to_text(completion)
match = re.search(r"<patch>\s*(.*?)\s*</patch>", text, flags=re.DOTALL)
return match.group(1).strip() if match else text.strip()


def completion_to_text(completion: object) -> str:
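    """Flatten a completion (string, chat-message list, or None) into plain text."""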
if isinstance(completion, str):
return completion
if isinstance(completion, list):
parts: list[str] = []
for item in completion:
if isinstance(item, Mapping):
content = item.get("content")
if isinstance(content, str):
parts.append(content)
elif item is not None:
parts.append(str(item))
return "\n".join(parts)
if completion is None:
return ""
return str(completion)


def normalize_patch(patch: str) -> str:
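    """Canonicalize a diff: LF endings, no trailing whitespace, no git index lines."""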
lines = []
for raw_line in patch.replace("\r\n", "\n").splitlines():
line = raw_line.rstrip()
if line.startswith("index "):
continue
lines.append(line)
while lines and not lines[0]:
lines.pop(0)
while lines and not lines[-1]:
lines.pop()
return "\n".join(lines) + ("\n" if lines else "")


def patch_file_paths(patch: str) -> set[str]:
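    """Collect changed file paths from "diff --git" and "+++" headers."""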
paths: set[str] = set()
for line in patch.replace("\r\n", "\n").splitlines():
if line.startswith("diff --git "):
parts = line.split()
if len(parts) >= 4:
paths.add(strip_diff_prefix(parts[3]))
elif line.startswith("+++ ") and not line.startswith("+++ /dev/null"):
paths.add(strip_diff_prefix(line[4:].strip()))
return {path for path in paths if path}


def strip_diff_prefix(path: str) -> str:
if path.startswith("a/") or path.startswith("b/"):
return path[2:]
return path


def official_submission(task: Mapping[str, Any], patch: str) -> dict[str, str]:
"""Return the JSONL row shape expected by the official SWE-bench harness."""
return {
"instance_id": str(task["task_id"]),
"model_patch": normalize_patch(patch),
}


async def exact_patch(task: vf.Task, state: vf.State) -> float:
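    """1.0 when the normalized submitted patch exactly matches the gold patch."""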
expected = normalize_patch(str(task["answer"]))
actual = normalize_patch(extract_patch(state.get("completion")))
return 1.0 if actual == expected else 0.0


async def patch_similarity(task: vf.Task, state: vf.State) -> float:
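    """SequenceMatcher ratio between the normalized submitted and gold patches."""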
expected = normalize_patch(str(task["answer"]))
actual = normalize_patch(extract_patch(state.get("completion")))
if not expected or not actual:
return 0.0
return SequenceMatcher(None, actual, expected).ratio()


async def patch_line_count(task: vf.Task, state: vf.State) -> float:
patch = normalize_patch(extract_patch(state.get("completion")))
return float(len([line for line in patch.splitlines() if line]))


async def gold_patch_line_count(task: vf.Task, state: vf.State) -> float:
patch = normalize_patch(str(task["answer"]))
return float(len([line for line in patch.splitlines() if line]))


async def changed_file_overlap(task: vf.Task, state: vf.State) -> float:
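    """Jaccard overlap between the gold and submitted changed-file sets."""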
expected = patch_file_paths(str(task["answer"]))
actual = patch_file_paths(extract_patch(state.get("completion")))
if not expected or not actual:
return 0.0
return len(expected & actual) / len(expected | actual)


def load_taskset(
dataset_name: str = DEFAULT_DATASET_NAME,
split: str = DEFAULT_SPLIT,
eval_split: str | None = None,
train_limit: int | None = None,
eval_limit: int | None = None,
repos: list[str] | None = None,
difficulties: list[str] | None = None,
keep_in_memory: bool = True,
system_prompt: str = SYSTEM_PROMPT,
exact_weight: float = 1.0,
similarity_weight: float = 0.0,
config: vf.TasksetConfig | None = None,
) -> vf.Taskset:
def build_train() -> Dataset:
return load_rows(
dataset_name,
split,
train_limit,
repos,
difficulties,
keep_in_memory,
)

def build_eval() -> Dataset:
return load_rows(
dataset_name,
eval_split or split,
eval_limit,
repos,
difficulties,
keep_in_memory,
)

rewards = []
metrics = [
patch_similarity,
changed_file_overlap,
patch_line_count,
gold_patch_line_count,
]
if exact_weight > 0:
rewards.append(vf.reward(weight=exact_weight)(exact_patch))
else:
metrics.insert(0, exact_patch)
if similarity_weight > 0:
rewards.append(vf.reward(weight=similarity_weight)(patch_similarity))

return vf.Taskset(
source=build_train,
eval_source=build_eval,
taskset_id=TASKSET_ID,
system_prompt=system_prompt,
rewards=rewards,
metrics=metrics,
config=config,
)


def load_environment(
dataset_name: str = DEFAULT_DATASET_NAME,
split: str = DEFAULT_SPLIT,
eval_split: str | None = None,
train_limit: int | None = None,
eval_limit: int | None = None,
repos: list[str] | None = None,
difficulties: list[str] | None = None,
keep_in_memory: bool = True,
system_prompt: str = SYSTEM_PROMPT,
exact_weight: float = 1.0,
similarity_weight: float = 0.0,
config: vf.EnvConfig | None = None,
) -> vf.Env:
config = config or vf.EnvConfig()
return vf.Env(
taskset=load_taskset(
dataset_name=dataset_name,
split=split,
eval_split=eval_split,
train_limit=train_limit,
eval_limit=eval_limit,
repos=repos,
difficulties=difficulties,
keep_in_memory=keep_in_memory,
system_prompt=system_prompt,
exact_weight=exact_weight,
similarity_weight=similarity_weight,
config=config.taskset,
),
harness=vf.Harness(config=config.harness),
)