diff --git a/.semgrep/verifiers.yml b/.semgrep/verifiers.yml index 2485f4b5d..853a98698 100644 --- a/.semgrep/verifiers.yml +++ b/.semgrep/verifiers.yml @@ -5,10 +5,19 @@ rules: message: Do not use `from __future__ import annotations`; quote only the specific forward references that need it. pattern: from __future__ import annotations + - id: verifiers-no-skip-validation + languages: [python] + severity: ERROR + message: SkipValidation hides type errors; use precise types and validators, or add a narrow nosemgrep waiver with a protocol-boundary rationale. + pattern-either: + - pattern: SkipValidation[$TYPE] + - pattern: $P.SkipValidation[$TYPE] + - pattern: from pydantic import SkipValidation + - id: verifiers-v1-config-param-one-type languages: [python] severity: ERROR - message: Public v1 `config` parameters must be one concrete config type or `None`; keep raw mappings at explicit config-loader boundaries. + message: Public v1 `config` parameters must be one concrete config type; keep raw mappings at explicit config-loader boundaries. paths: include: - /verifiers/v1/**/*.py @@ -24,6 +33,27 @@ rules: metavariable: $ANNOT regex: "(Any|ConfigMap|Mapping\\[str, object\\]|dict\\[str, object\\]|.*\\|.*\\|.*)" + - id: verifiers-v1-loaders-require-config + languages: [python] + severity: ERROR + message: Public `load_taskset` and `load_harness` loaders must require one concrete config object; defaults are supplied by the typed EnvConfig. + paths: + include: + - /environments/**/*.py + pattern-either: + - pattern: | + def load_taskset(..., config: $CONFIG | None = None, ...): + ... + - pattern: | + def load_harness(..., config: $CONFIG | None = None, ...): + ... + - pattern: | + def load_taskset(..., config: Optional[$CONFIG] = None, ...): + ... + - pattern: | + def load_harness(..., config: Optional[$CONFIG] = None, ...): + ... 
+ - id: verifiers-no-private-framework-classes languages: [python] severity: ERROR diff --git a/README.md b/README.md index 40f64d333..461d6ecbd 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali ```bash prime env init my-env # creates a new template in ./environments/my_env ``` +Add an explicit harness loader when the environment owns harness behavior: +```bash +prime env init my-env --with-harness +``` For OpenEnv integration, use: ```bash prime env init my-openenv --openenv @@ -116,7 +120,9 @@ environments/my_env/ └── README.md # Documentation ``` -Environment modules should expose a `load_environment` function which returns an instance of the Environment object, and which can accept custom arguments. For example: +Environment modules should expose a `load_environment` function which returns an +environment object. For simple legacy environments, this can still be a direct +constructor: ```python # my_env.py import verifiers as vf @@ -148,7 +154,7 @@ def source(): async def contains_answer(task, state) -> float: return float(task["answer"] in str(state.get("completion") or "")) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[contains_answer], config=config) def load_environment(config: vf.EnvConfig) -> vf.Env: @@ -169,8 +175,8 @@ env = vf.Env( ``` The same environment package is the unit used by evals and `prime-rl`. 
The -trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset -and harness options stay under `env.taskset` and `env.harness`: +trainer owns model, endpoint, sampling, and rollout count; v1-specific options +stay on the taskset or harness config that owns them: ```toml # configs/rl/my-v1-env.toml @@ -185,12 +191,12 @@ max_tokens = 4096 [[env]] id = "my-env" -[env.args] -arg1 = "non-th-arg" - [env.harness] max_turns = 1 +[env.taskset] +split = "train" + [env.taskset.scoring.contains_answer] weight = 1.0 ``` diff --git a/assets/lab/environments/AGENTS.md b/assets/lab/environments/AGENTS.md index 60f640210..a6519c8a2 100644 --- a/assets/lab/environments/AGENTS.md +++ b/assets/lab/environments/AGENTS.md @@ -693,14 +693,18 @@ environments/my_env/ └── README.md # documentation template ``` -The environment file must export a `load_environment()` function that returns a `vf.Environment`. Explicitly declare any arguments your environment accepts: +The environment file exports a taskset-first v1 loader: ```python import verifiers as vf -def load_environment(difficulty: str = "easy", num_examples: int = -1) -> vf.Environment: - # build dataset, rubric, etc. 
- return vf.SingleTurnEnv(dataset=dataset, rubric=rubric) + +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: + return vf.Taskset(source=source, rewards=[reward_fn], config=config) + + +def load_environment(config: vf.EnvConfig) -> vf.Env: + return vf.Env(taskset=load_taskset(config=config.taskset)) ``` ### pyproject.toml diff --git a/docs/byo-harness.md b/docs/byo-harness.md index e1bc99145..604c934aa 100644 --- a/docs/byo-harness.md +++ b/docs/byo-harness.md @@ -50,7 +50,7 @@ async def contains_answer(task, state) -> float: return float(task["answer"] in str(state.get("completion") or "")) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[contains_answer], config=config) @@ -74,8 +74,7 @@ class GSM8KTasksetConfig(vf.TasksetConfig): split: str = "train" -def load_taskset(config: vf.TasksetConfig | None = None): - config = GSM8KTasksetConfig(config) +def load_taskset(config: GSM8KTasksetConfig): dataset_name = config.dataset_name split = config.split @@ -273,7 +272,7 @@ Create a harness when rollout behavior is no longer just "call the model with the resolved taskset tools." ```python -def load_harness(config: vf.HarnessConfig | None = None): +def load_harness(config: vf.HarnessConfig): return vf.Harness( program={"fn": "my_env.program:run"}, config=config, @@ -422,8 +421,7 @@ def load_environment(config: vf.EnvConfig): ) ``` -Eval config passes named environment args through `args` and v1 config through -the `taskset`/`harness` sections: +Eval config passes v1 config through the `taskset`/`harness` sections: ```toml model = "openai/gpt-5.4-mini" @@ -441,47 +439,32 @@ max_turns = 4 weight = 0.5 ``` -For concise named args, define one typed args object and pass it as `args`. -`EnvConfig.args` is intentionally user-defined; environment packages decide how -those args flow into taskset and harness construction. 
+For environment-specific settings, define leaf fields on the taskset or harness +config that owns them. An `EnvConfig` subclass only fixes the concrete taskset +and harness config types for the loader. ```python -class MyEnvArgsConfig(vf.Config): +class MyTasksetConfig(vf.TasksetConfig): split: str = "train" - max_turns: int = 10 -class MyTasksetConfig(vf.TasksetConfig): - split: str = "train" +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: vf.HarnessConfig -def load_taskset(config: vf.TasksetConfig | None = None): - config = MyTasksetConfig(config) +def load_taskset(config: MyTasksetConfig): ... -def load_harness(config: vf.HarnessConfig | None = None): - config = vf.HarnessConfig(config) +def load_harness(config: vf.HarnessConfig): ... -def load_environment( - config: vf.EnvConfig, - split: str = "train", - max_turns: int = 10, -): - config = vf.EnvConfig( - config, - args=MyEnvArgsConfig(split=split, max_turns=max_turns), - ) - args = MyEnvArgsConfig(config.args) +def load_environment(config: MyEnvConfig): return vf.Env( - taskset=load_taskset( - config=MyTasksetConfig(config.taskset, split=args.split) - ), - harness=load_harness( - config=vf.HarnessConfig(config.harness, max_turns=args.max_turns) - ), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) ``` @@ -499,12 +482,12 @@ max_tokens = 4096 [[env]] id = "primeintellect/my-v1-env" -[env.args] -arg1 = "non-th-arg" - [env.harness] max_turns = 8 +[env.taskset] +split = "train" + [env.taskset.toolsets.search] tools = ["my_env.tools:search"] bindings = { "search.index" = "objects.index" } diff --git a/docs/development.md b/docs/development.md index 32f41ca28..8c3637108 100644 --- a/docs/development.md +++ b/docs/development.md @@ -257,6 +257,9 @@ uv run pytest tests/test_file.py::test_name -vvs --pdb # Initialize template prime env init my-environment +# Include an explicit harness loader when needed +prime env init my-environment 
--with-harness + # Install locally for testing prime env install my-environment @@ -317,7 +320,7 @@ uv run ruff format --check verifiers tests # Verify Python formatting uv run ty check verifiers # Type check (matches CI Ty target) # Environment tools -prime env init new-env # Create environment +prime env init new-env # Create taskset-first v1 environment prime env install new-env # Install environment prime eval run new-env -m openai/gpt-4.1-mini -n 5 # Test environment prime eval tui # Browse evals in the tree browser diff --git a/docs/environments.md b/docs/environments.md index 7b1e3a9d5..42c45c9be 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -687,14 +687,18 @@ environments/my_env/ └── README.md # documentation template ``` -The environment file must export a `load_environment()` function that returns a `vf.Environment`. Explicitly declare any arguments your environment accepts: +The environment file exports a taskset-first v1 loader: ```python import verifiers as vf -def load_environment(difficulty: str = "easy", num_examples: int = -1) -> vf.Environment: - # build dataset, rubric, etc. - return vf.SingleTurnEnv(dataset=dataset, rubric=rubric) + +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: + return vf.Taskset(source=source, rewards=[reward_fn], config=config) + + +def load_environment(config: vf.EnvConfig) -> vf.Env: + return vf.Env(taskset=load_taskset(config=config.taskset)) ``` ### pyproject.toml diff --git a/docs/evaluation.md b/docs/evaluation.md index 94e54394b..0309e7eeb 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -68,7 +68,8 @@ The positional argument accepts two formats: Environment IDs are converted to Python module names (`my-env` → `my_env`) and imported. Modules must be installed (via `prime env install` or `uv pip install`). 
-The `--env-args` flag passes arguments to your `load_environment()` function: +For legacy or direct-constructor environments, the `--env-args` flag passes +arguments to your `load_environment()` function: ```bash prime eval run my-env -a '{"difficulty": "hard", "num_examples": 100}' @@ -371,15 +372,15 @@ optional: |-------|------|-------------| | `id` | string | **Required.** Environment module name | | `args` | table | Arguments passed to `load_environment()` | -| `taskset` | table | v1 taskset config passed through `config.taskset` | -| `harness` | table | v1 harness config passed through `config.harness` | +| `taskset` | table | v1 taskset config passed through `EnvConfig.taskset` | +| `harness` | table | v1 harness config passed through `EnvConfig.harness` | | `num_examples` | integer | Number of dataset examples to evaluate | | `rollouts_per_example` | integer | Rollouts per example | | `extra_env_kwargs` | table | Arguments passed to environment constructor | | `model` | string | Model to evaluate | | `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) | -Example with environment args: +Example with legacy environment args: ```toml [[eval]] diff --git a/docs/overview.md b/docs/overview.md index 50f9ed62e..308a6ee11 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -52,6 +52,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali ```bash prime env init my-env # creates a new template in ./environments/my_env ``` +Add an explicit harness loader when the environment owns harness behavior: +```bash +prime env init my-env --with-harness +``` This will create a new module called `my_env` with a basic environment template. ``` @@ -61,7 +65,9 @@ environments/my_env/ └── README.md # Documentation ``` -Environment modules should expose a `load_environment` function which returns an instance of the Environment object, and which can accept custom arguments. 
For example: +Environment modules should expose a `load_environment` function which returns an +environment object. For simple legacy environments, this can still be a direct +constructor: ```python # my_env.py import verifiers as vf @@ -93,7 +99,7 @@ def source(): async def contains_answer(task, state) -> float: return float(task["answer"] in str(state.get("completion") or "")) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[contains_answer], config=config) def load_environment(config: vf.EnvConfig) -> vf.Env: diff --git a/docs/reference.md b/docs/reference.md index 8b9697607..13ab011e6 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1009,9 +1009,8 @@ class Config(BaseModel): ) -> Self: ... class EnvConfig(Config): - args: object | None = None - taskset: object | None = None - harness: object | None = None + taskset: TasksetConfig + harness: HarnessConfig class TasksetConfig(Config): taskset_id: str | None = None @@ -1030,8 +1029,13 @@ class HarnessConfig(Config): ``` `EnvConfig` is the typed v1 loader envelope. TOML `[env.taskset]` and -`[env.harness]` sections flow to `config.taskset` and `config.harness`; -environment-specific named args flow through `[env.args]`. +`[env.harness]` sections populate `EnvConfig.taskset` and `EnvConfig.harness`. +Environment-specific fields belong on the taskset or harness config that owns +them; `EnvConfig` subclasses only bind concrete child config types. +`taskset` must be typed as a `TasksetConfig` subclass, and `harness` must be +typed as a `HarnessConfig` subclass. +Annotation-only `Config` fields on `Config` subclasses default to their config +class, so nested config objects do not need `Field(default_factory=...)`. `Config` subclasses accept a positional source config plus direct keyword overrides. 
The source object is positional-only so subclasses can define a real diff --git a/docs/training.md b/docs/training.md index d017d50fd..8420b95ba 100644 --- a/docs/training.md +++ b/docs/training.md @@ -90,12 +90,12 @@ max_tokens = 4096 [[env]] id = "primeintellect/my-v1-env" -[env.args] -arg1 = "non-th-arg" - [env.harness] max_turns = 8 +[env.taskset] +split = "train" + [env.taskset.toolsets.search] tools = ["my_env.tools:search"] bindings = { "search.index" = "objects.index" } diff --git a/environments/AGENTS.md b/environments/AGENTS.md index 35b3393a6..8bb0c865b 100644 --- a/environments/AGENTS.md +++ b/environments/AGENTS.md @@ -693,14 +693,18 @@ environments/my_env/ └── README.md # documentation template ``` -The environment file must export a `load_environment()` function that returns a `vf.Environment`. Explicitly declare any arguments your environment accepts: +The environment file exports a taskset-first v1 loader: ```python import verifiers as vf -def load_environment(difficulty: str = "easy", num_examples: int = -1) -> vf.Environment: - # build dataset, rubric, etc. 
- return vf.SingleTurnEnv(dataset=dataset, rubric=rubric) + +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: + return vf.Taskset(source=source, rewards=[reward_fn], config=config) + + +def load_environment(config: vf.EnvConfig) -> vf.Env: + return vf.Env(taskset=load_taskset(config=config.taskset)) ``` ### pyproject.toml diff --git a/environments/bfcl_v3/bfcl_v3.py b/environments/bfcl_v3/bfcl_v3.py index da07de2cf..eb012796a 100644 --- a/environments/bfcl_v3/bfcl_v3.py +++ b/environments/bfcl_v3/bfcl_v3.py @@ -27,6 +27,7 @@ class BFCLTasksetConfig(vf.TasksetConfig): test_category: str = "simple_python" + test_categories: list[str] | None = None examples_per_category: int = -1 @@ -34,6 +35,11 @@ class BFCLHarnessConfig(vf.HarnessConfig): test_category: str = "simple_python" +class BFCLEnvConfig(vf.EnvConfig): + taskset: BFCLTasksetConfig + harness: BFCLHarnessConfig + + def modded_convert_func_name(function_name: str, model_name: str) -> str: _ = model_name return re.sub(r"\.", "_", function_name) @@ -567,23 +573,14 @@ def sync_completion() -> list[vf.ConfigData]: class BFCLMultiTurnHarness(vf.Harness): - def __init__(self, config: vf.HarnessConfig | None = None): + def __init__(self, config: BFCLHarnessConfig): super().__init__(program=self.run_bfcl_multi_turn, config=config) async def run_bfcl_multi_turn(self, task: vf.Task, state: vf.State) -> vf.State: return await bfcl_multi_turn_program(task, state, self) -def load_taskset( - test_category: str | None = None, - examples_per_category: int | None = None, - config: vf.TasksetConfig | None = None, -) -> vf.Taskset: - config = BFCLTasksetConfig( - config, - test_category=test_category, - examples_per_category=examples_per_category, - ) +def load_taskset(config: BFCLTasksetConfig) -> vf.Taskset: return vf.Taskset( source=build_source(config.test_category, config.examples_per_category), rewards=[bfcl_reward], @@ -591,11 +588,7 @@ def load_taskset( ) -def load_harness( - test_category: str | None = None, - 
config: vf.HarnessConfig | None = None, -) -> vf.Harness: - config = BFCLHarnessConfig(config, test_category=test_category) +def load_harness(config: BFCLHarnessConfig) -> vf.Harness: patch_bfcl_eval() from bfcl_eval.utils import is_multi_turn @@ -604,30 +597,22 @@ def load_harness( return vf.Harness(config=config) -def load_environment( - config: vf.EnvConfig, - *, - test_category: str = "simple_python", - test_categories: list[str] | None = None, - examples_per_category: int = -1, -) -> vf.Env | vf.EnvGroup: - categories = [test_category] if test_categories is None else test_categories +def load_environment(config: BFCLEnvConfig) -> vf.Env | vf.EnvGroup: + base_taskset_config = config.taskset + base_harness_config = config.harness + categories = base_taskset_config.test_categories or [ + base_taskset_config.test_category + ] envs: list[vf.Env] = [] for category in categories: - category_config = vf.EnvConfig( - config, - taskset=BFCLTasksetConfig( - test_category=category, - examples_per_category=examples_per_category, - ), - harness=BFCLHarnessConfig(test_category=category), - ) + taskset_config = BFCLTasksetConfig(base_taskset_config, test_category=category) + harness_config = BFCLHarnessConfig(base_harness_config, test_category=category) envs.append( vf.Env( - taskset=load_taskset(config=category_config.taskset), - harness=load_harness(config=category_config.harness), + taskset=load_taskset(config=taskset_config), + harness=load_harness(config=harness_config), ) ) - if test_categories is not None: + if base_taskset_config.test_categories is not None: return vf.EnvGroup(envs=envs, env_names=categories) return envs[0] diff --git a/environments/dspy_flights/README.md b/environments/dspy_flights/README.md index 1183f5ba0..15da9832d 100644 --- a/environments/dspy_flights/README.md +++ b/environments/dspy_flights/README.md @@ -2,6 +2,7 @@ Minimal v1 environment for a third-party DSPy flight-support program. 
-`load_harness()` uses a sandboxed Python `program.fn` entrypoint. v1 resolves -this package from `pyproject.toml`, installs it in the program sandbox, and then -runs `dspy_flights:run_dspy_flight_program` with normal package dependencies. +`load_harness(config)` uses a sandboxed Python `program.fn` entrypoint. v1 +resolves this package from `pyproject.toml`, installs it in the program sandbox, +and then runs `dspy_flights:run_dspy_flight_program` with normal package +dependencies. diff --git a/environments/dspy_flights/dspy_flights.py b/environments/dspy_flights/dspy_flights.py index 9d811a785..3327c473a 100644 --- a/environments/dspy_flights/dspy_flights.py +++ b/environments/dspy_flights/dspy_flights.py @@ -414,7 +414,7 @@ def stringify_nested(value: object) -> object: return str(value) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset( source=source, rewards=[expected_database_change], @@ -423,7 +423,7 @@ def load_taskset(config: vf.TasksetConfig | None = None): ) -def load_harness(config: vf.HarnessConfig | None = None): +def load_harness(config: vf.HarnessConfig): return vf.Harness( program={"fn": "dspy_flights:run_dspy_flight_program", "sandbox": True}, sandbox=PROGRAM_SANDBOX, diff --git a/environments/dspy_rlm/README.md b/environments/dspy_rlm/README.md index aad6ac5d7..e03412805 100644 --- a/environments/dspy_rlm/README.md +++ b/environments/dspy_rlm/README.md @@ -14,7 +14,7 @@ - **Primary dataset(s)**: `gsm8k` train (train) and test (eval) via `load_example_dataset` - **Source links**: Uses the example loader in `verifiers.utils.data_utils` -- **Split sizes**: Configurable via args; defaults to 50 train / 20 eval +- **Split sizes**: Configurable via taskset config; defaults to 50 train / 20 eval ### Task @@ -40,13 +40,12 @@ Configure model and sampling: ```bash prime eval run dspy-rlm \ -m gpt-4.1-mini \ - -n 10 -r 3 -t 1024 -T 0.7 \ - -a '{"num_train_examples": 50, 
"num_eval_examples": 20}' + -n 10 -r 3 -t 1024 -T 0.7 ``` -### Environment Arguments +### Taskset Config -| Arg | Type | Default | Description | +| Field | Type | Default | Description | | -------------------- | ----- | ------- | ------------------------------ | | `num_train_examples` | int | `50` | Number of training examples | | `num_eval_examples` | int | `20` | Number of evaluation examples | diff --git a/environments/dspy_rlm/dspy_rlm.py b/environments/dspy_rlm/dspy_rlm.py index 165717a00..55298e402 100644 --- a/environments/dspy_rlm/dspy_rlm.py +++ b/environments/dspy_rlm/dspy_rlm.py @@ -4,6 +4,16 @@ from verifiers.utils.data_utils import load_example_dataset +class DSPYRLMTasksetConfig(vf.TasksetConfig): + num_train_examples: int = 50 + num_eval_examples: int = 20 + + +class DSPYRLMEnvConfig(vf.EnvConfig): + taskset: DSPYRLMTasksetConfig + harness: vf.HarnessConfig + + async def run_dspy_rlm_program(task: vf.Task, state: vf.State) -> vf.State: import dspy @@ -84,35 +94,23 @@ def answer_reward(task: vf.Task, state: vf.State) -> float: return answers_match(agent_answer, str(task.get("answer", ""))) -def load_taskset( - num_train_examples: int = 50, - num_eval_examples: int = 20, - config: vf.TasksetConfig | None = None, -) -> vf.Taskset: +def load_taskset(config: DSPYRLMTasksetConfig) -> vf.Taskset: return vf.Taskset( - source=lambda: load_rows("train", num_train_examples), - eval_source=lambda: load_rows("test", num_eval_examples), + source=lambda: load_rows("train", config.num_train_examples), + eval_source=lambda: load_rows("test", config.num_eval_examples), taskset_id="gsm8k-dspy-rlm", rewards=[answer_reward], config=config, ) -def load_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: +def load_harness(config: vf.HarnessConfig) -> vf.Harness: return vf.Harness(program=run_dspy_rlm_program, config=config) -def load_environment( - config: vf.EnvConfig, - num_train_examples: int = 50, - num_eval_examples: int = 20, -) -> vf.Env: +def 
load_environment(config: DSPYRLMEnvConfig) -> vf.Env: """Load the DSPy RLM V1 taskset/harness example environment.""" return vf.Env( - taskset=load_taskset( - num_train_examples=num_train_examples, - num_eval_examples=num_eval_examples, - config=config.taskset, - ), - harness=load_harness(config.harness), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) diff --git a/environments/hello_group_reward_v1/hello_group_reward_v1.py b/environments/hello_group_reward_v1/hello_group_reward_v1.py index 3a5d5e9de..853073b8a 100644 --- a/environments/hello_group_reward_v1/hello_group_reward_v1.py +++ b/environments/hello_group_reward_v1/hello_group_reward_v1.py @@ -19,6 +19,11 @@ class GroupRewardHarnessConfig(vf.HarnessConfig): max_turns: int = 1 +class GroupRewardEnvConfig(vf.EnvConfig): + taskset: GroupRewardTasksetConfig + harness: GroupRewardHarnessConfig + + def group_reward_task( task_id: str, question: str, @@ -303,12 +308,7 @@ def source(num_examples: int = -1): } -def load_taskset( - num_examples: int | None = None, - config: vf.TasksetConfig | None = None, -) -> GroupRewardTaskset: - config = GroupRewardTasksetConfig(config, num_examples=num_examples) - +def load_taskset(config: GroupRewardTasksetConfig) -> GroupRewardTaskset: def load_rows(): return source(num_examples=config.num_examples) @@ -324,11 +324,7 @@ def load_rows(): ) -def load_harness( - max_turns: int | None = None, - config: vf.HarnessConfig | None = None, -) -> vf.Harness: - config = GroupRewardHarnessConfig(config, max_turns=max_turns) +def load_harness(config: GroupRewardHarnessConfig) -> vf.Harness: return vf.Harness( program=candidate_program, max_turns=config.max_turns, @@ -336,24 +332,8 @@ def load_harness( ) -def load_environment( - num_examples: int = -1, - *, - config: vf.EnvConfig, -) -> vf.Env: - config = vf.EnvConfig( - config, - taskset=GroupRewardTasksetConfig(num_examples=num_examples), - ) +def load_environment(config: 
GroupRewardEnvConfig) -> vf.Env: return vf.Env( taskset=load_taskset(config=config.taskset), harness=load_harness(config=config.harness), ) - - -def load_v1_environment( - num_examples: int = -1, - *, - config: vf.EnvConfig, -) -> vf.Env: - return load_environment(num_examples=num_examples, config=config) diff --git a/environments/hello_parallel_sandbox_v1/hello_parallel_sandbox_v1.py b/environments/hello_parallel_sandbox_v1/hello_parallel_sandbox_v1.py index 160e87539..c37cd84b0 100644 --- a/environments/hello_parallel_sandbox_v1/hello_parallel_sandbox_v1.py +++ b/environments/hello_parallel_sandbox_v1/hello_parallel_sandbox_v1.py @@ -139,6 +139,11 @@ class ParallelSandboxHarnessConfig(vf.HarnessConfig): max_turns: int = 4 +class ParallelSandboxEnvConfig(vf.EnvConfig): + taskset: ParallelSandboxTasksetConfig + harness: ParallelSandboxHarnessConfig + + async def bash(command: str, sandbox, state) -> str: """Run a bash command in the active program sandbox.""" result = await sandbox.execute(command, timeout=120, working_dir="/tmp") @@ -339,12 +344,7 @@ def source(num_examples: int = -1): } -def load_taskset( - num_examples: int | None = None, - config: vf.TasksetConfig | None = None, -) -> vf.Taskset: - config = ParallelSandboxTasksetConfig(config, num_examples=num_examples) - +def load_taskset(config: ParallelSandboxTasksetConfig) -> vf.Taskset: def load_rows(): return source(num_examples=config.num_examples) @@ -360,11 +360,7 @@ def load_rows(): ) -def load_harness( - max_turns: int | None = None, - config: vf.HarnessConfig | None = None, -) -> vf.Harness: - config = ParallelSandboxHarnessConfig(config, max_turns=max_turns) +def load_harness(config: ParallelSandboxHarnessConfig) -> vf.Harness: return vf.Harness( program={"sandbox": True, "channels": "callable"}, sandbox=PROGRAM_SANDBOX, @@ -373,31 +369,8 @@ def load_harness( ) -def load_environment( - num_examples: int = -1, - max_turns: int = 4, - *, - config: vf.EnvConfig, -) -> vf.Env: - config = vf.EnvConfig( 
- config, - taskset=ParallelSandboxTasksetConfig(num_examples=num_examples), - harness=ParallelSandboxHarnessConfig(max_turns=max_turns), - ) +def load_environment(config: ParallelSandboxEnvConfig) -> vf.Env: return vf.Env( taskset=load_taskset(config=config.taskset), harness=load_harness(config=config.harness), ) - - -def load_v1_environment( - num_examples: int = -1, - max_turns: int = 4, - *, - config: vf.EnvConfig, -) -> vf.Env: - return load_environment( - num_examples=num_examples, - max_turns=max_turns, - config=config, - ) diff --git a/environments/hello_rlm_v1/hello_rlm_v1.py b/environments/hello_rlm_v1/hello_rlm_v1.py index b0f599881..bff3b55a7 100644 --- a/environments/hello_rlm_v1/hello_rlm_v1.py +++ b/environments/hello_rlm_v1/hello_rlm_v1.py @@ -52,7 +52,7 @@ def source(): ] -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset( source=source, rewards=[exact_answer], @@ -60,13 +60,17 @@ def load_taskset(config: vf.TasksetConfig | None = None): ) -def load_harness(config: vf.RLMConfig | None = None): +def load_harness(config: vf.RLMConfig): return vf.RLM(config=config) -def load_environment(config: vf.EnvConfig): - harness_config = None if config.harness is None else vf.RLMConfig(config.harness) +class HelloRLMEnvConfig(vf.EnvConfig): + taskset: vf.TasksetConfig + harness: vf.RLMConfig + + +def load_environment(config: HelloRLMEnvConfig): return vf.Env( taskset=load_taskset(config=config.taskset), - harness=load_harness(config=harness_config), + harness=load_harness(config=config.harness), ) diff --git a/environments/hello_self_judge_v1/hello_self_judge_v1.py b/environments/hello_self_judge_v1/hello_self_judge_v1.py index c710c4a61..60564ae0a 100644 --- a/environments/hello_self_judge_v1/hello_self_judge_v1.py +++ b/environments/hello_self_judge_v1/hello_self_judge_v1.py @@ -160,6 +160,11 @@ class SelfJudgeHarnessConfig(vf.HarnessConfig): max_turns: int = 8 +class 
SelfJudgeEnvConfig(vf.EnvConfig): + taskset: SelfJudgeTasksetConfig + harness: SelfJudgeHarnessConfig + + async def bash(command: str, sandbox, state) -> str: """Run a bash command in the rollout sandbox and return stdout/stderr.""" result = await sandbox.execute(command, timeout=120, working_dir="/tmp") @@ -341,11 +346,8 @@ def load_bash_toolset(config=None) -> vf.Toolset: def load_taskset( - num_examples: int | None = None, - config: vf.TasksetConfig | None = None, + config: SelfJudgeTasksetConfig, ) -> vf.Taskset: - config = SelfJudgeTasksetConfig(config, num_examples=num_examples) - def load_rows(): return source(num_examples=config.num_examples) @@ -361,38 +363,15 @@ def load_rows(): def load_harness( - max_turns: int | None = None, - config: vf.HarnessConfig | None = None, + config: SelfJudgeHarnessConfig, ) -> vf.Harness: - config = SelfJudgeHarnessConfig(config, max_turns=max_turns) return vf.Harness(max_turns=config.max_turns, config=config) def load_environment( - num_examples: int = -1, - max_turns: int = 8, - *, - config: vf.EnvConfig, + config: SelfJudgeEnvConfig, ) -> vf.Env: - config = vf.EnvConfig( - config, - taskset=SelfJudgeTasksetConfig(num_examples=num_examples), - harness=SelfJudgeHarnessConfig(max_turns=max_turns), - ) return vf.Env( taskset=load_taskset(config=config.taskset), harness=load_harness(config=config.harness), ) - - -def load_v1_environment( - num_examples: int = -1, - max_turns: int = 8, - *, - config: vf.EnvConfig, -) -> vf.Env: - return load_environment( - num_examples=num_examples, - max_turns=max_turns, - config=config, - ) diff --git a/environments/hello_subagent_v1/hello_subagent_v1.py b/environments/hello_subagent_v1/hello_subagent_v1.py index a5076530a..493582f31 100644 --- a/environments/hello_subagent_v1/hello_subagent_v1.py +++ b/environments/hello_subagent_v1/hello_subagent_v1.py @@ -73,7 +73,7 @@ def load_toolset(): ) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: 
vf.TasksetConfig): return vf.Taskset( source=source, system_prompt=( @@ -86,7 +86,7 @@ def load_taskset(config: vf.TasksetConfig | None = None): ) -def load_harness(config: vf.HarnessConfig | None = None): +def load_harness(config: vf.HarnessConfig): return vf.Harness( toolsets=[load_toolset()], metrics=[subagent_calls], diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md index 2b3f83045..7fbec5fce 100644 --- a/environments/langchain_deep_agents_wikispeedia/README.md +++ b/environments/langchain_deep_agents_wikispeedia/README.md @@ -35,23 +35,23 @@ Configure model and difficulty band: prime eval run langchain-deep-agents-wikispeedia \ -m openai/gpt-4.1-mini \ -n 20 -r 3 -t 4096 -T 0.7 \ - -a '{"min_path_length": 4, "max_path_length": 6, "max_turns": 40}' + -a '{"config": {"taskset": {"min_path_length": 4, "max_path_length": 6, "max_turns": 40}}}' ``` Disable `go_back` (force planning over backtracking): ```bash prime eval run langchain-deep-agents-wikispeedia \ -m openai/gpt-4.1-mini -n 20 -r 3 \ - -a '{"allow_go_back": false}' + -a '{"config": {"taskset": {"allow_go_back": false}}}' ``` Notes: - The first run downloads ~5MB of SNAP data into `~/.cache/wikispeedia` (override with `cache_dir`). - Set `OPENAI_API_KEY` (or whatever the policy endpoint expects) for the agent. -### Environment Arguments +### Taskset Config -| Arg | Type | Default | Description | +| Field | Type | Default | Description | | --- | ---- | ------- | ----------- | | `cache_dir` | str \| None | `None` | SNAP cache directory (defaults to `~/.cache/wikispeedia`). | | `min_path_length` | int | `3` | Drop pairs with shortest path shorter than this. | @@ -63,10 +63,16 @@ Notes: | `links_only` | bool | `False` | Render articles as just the link menu (ablation: tests whether the agent navigates from semantic content or link names alone). | | `allow_go_back` | bool | `True` | Expose the `go_back` tool. 
| | `max_turns` | int | `50` | Per-rollout turn cap. | -| `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. | | `efficiency_weight` | float | `0.0` | If `> 0`, mix `path_efficiency` into the reward at this weight (a near-optimal route earns up to `1 + efficiency_weight`; a wanderer that reaches the target still earns `1`). Default `0.0` keeps reward as pure binary reachability. | | `stratify_path_length` | bool | `True` | Take equal counts at each shortest-path bucket inside `[min_path_length, max_path_length]`, capped at the smallest non-empty bucket. The SNAP graph's natural distribution heavily skews toward the lower end of any band (4-6 → 83% sp=4); without stratification the policy over-trains on the trivial floor. Set `False` to recover the natural distribution. | +### Harness Config + +| Field | Type | Default | Description | +| --- | ---- | ------- | ----------- | +| `max_turns` | int | `50` | LangChain recursion limit fallback when runtime config does not provide one. | +| `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. 
| + ### Metrics | Metric | Meaning | | ------ | ------- | diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py index 3aa57e9c3..96ab096ed 100644 --- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py +++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py @@ -48,6 +48,39 @@ def system_prompt(allow_go_back: bool = True) -> str: SYSTEM_PROMPT = system_prompt() +class WikispeediaTasksetConfig(vf.TasksetConfig): + cache_dir: str | None = None + min_path_length: int = 3 + max_path_length: int = 6 + train_size: int = 50_000 + eval_size: int = 1_000 + eval_target_fraction: float = 0.1 + split_seed: int = 0 + links_only: bool = False + allow_go_back: bool = True + max_turns: int = 50 + efficiency_weight: float = 0.0 + stratify_path_length: bool = True + + +class WikispeediaHarnessConfig(vf.HarnessConfig): + max_turns: int = 50 + timeout_seconds: float = 1200.0 + + +class WikispeediaEnvConfig(vf.EnvConfig): + taskset: WikispeediaTasksetConfig + harness: WikispeediaHarnessConfig + + +class WikispeediaTaskset(vf.Taskset): + config_type = WikispeediaTasksetConfig + + +class WikispeediaHarness(vf.Harness): + config_type = WikispeediaHarnessConfig + + def format_article(wiki: WikiGraph, article: str, links_only: bool = False) -> str: links = wiki.get_links(article) links_str = ", ".join(links) if links else "(no outgoing links)" @@ -423,52 +456,38 @@ async def run_langchain_deep_agents_wikispeedia_program( return run_langchain_deep_agents_wikispeedia_program -def load_taskset( - cache_dir: str | None = None, - min_path_length: int = 3, - max_path_length: int = 6, - train_size: int = 50_000, - eval_size: int = 1_000, - eval_target_fraction: float = 0.1, - split_seed: int = 0, - links_only: bool = False, - allow_go_back: bool = True, - max_turns: int = 50, - efficiency_weight: float = 
0.0, - stratify_path_length: bool = True, - config: vf.TasksetConfig | None = None, -) -> vf.Taskset: +def load_taskset(config: WikispeediaTasksetConfig) -> WikispeediaTaskset: pair_cache: dict[str, tuple[list[WikiPair], list[WikiPair]]] = {} def pairs() -> tuple[list[WikiPair], list[WikiPair]]: if "pairs" not in pair_cache: - pair_cache["pairs"] = load_wiki_graph(cache_dir).split_pairs( - train_size=train_size, - eval_size=eval_size, - min_dist=min_path_length, - max_dist=max_path_length, - eval_target_fraction=eval_target_fraction, - seed=split_seed, - stratify=stratify_path_length, + pair_cache["pairs"] = load_wiki_graph(config.cache_dir).split_pairs( + train_size=config.train_size, + eval_size=config.eval_size, + min_dist=config.min_path_length, + max_dist=config.max_path_length, + eval_target_fraction=config.eval_target_fraction, + seed=config.split_seed, + stratify=config.stratify_path_length, ) return pair_cache["pairs"] def build_train() -> Dataset: train, _ = pairs() return build_dataset( - load_wiki_graph(cache_dir), + load_wiki_graph(config.cache_dir), train, - links_only=links_only, - max_turns=max_turns, + links_only=config.links_only, + max_turns=config.max_turns, ) def build_eval() -> Dataset: _, eval_ = pairs() return build_dataset( - load_wiki_graph(cache_dir), + load_wiki_graph(config.cache_dir), eval_, - links_only=links_only, - max_turns=max_turns, + links_only=config.links_only, + max_turns=config.max_turns, ) rewards = [reached_target] @@ -484,81 +503,50 @@ def build_eval() -> Dataset: for name in sorted(DEEP_AGENT_TOOLS | WIKISPEEDIA_TOOLS) ], ] - if efficiency_weight > 0: + if config.efficiency_weight > 0: async def weighted_path_efficiency(task: vf.Task, state: vf.State) -> float: return await path_efficiency(task, state) weighted_path_efficiency.__name__ = "path_efficiency" - rewards.append(vf.reward(weight=efficiency_weight)(weighted_path_efficiency)) + rewards.append( + vf.reward(weight=config.efficiency_weight)(weighted_path_efficiency) 
+ ) else: metrics.insert(0, path_efficiency) - return vf.Taskset( + return WikispeediaTaskset( source=build_train, eval_source=build_eval, taskset_id="langchain-deep-agents-wikispeedia", - system_prompt=system_prompt(allow_go_back=allow_go_back), - toolsets=[load_toolset(cache_dir=cache_dir, allow_go_back=allow_go_back)], + system_prompt=system_prompt(allow_go_back=config.allow_go_back), + toolsets=[ + load_toolset( + cache_dir=config.cache_dir, + allow_go_back=config.allow_go_back, + ) + ], rewards=rewards, metrics=metrics, config=config, ) -def load_harness( - max_turns: int = 50, - timeout_seconds: float = 1200.0, - config: vf.HarnessConfig | None = None, -) -> vf.Harness: - return vf.Harness( +def load_harness(config: WikispeediaHarnessConfig) -> WikispeediaHarness: + return WikispeediaHarness( program=make_langchain_deep_agents_program( - max_turns=max_turns, - timeout_seconds=timeout_seconds, + max_turns=config.max_turns, + timeout_seconds=config.timeout_seconds, ), - max_turns=max_turns, updates=[restore_agent_completion], config=config, ) -def load_environment( - config: vf.EnvConfig, - cache_dir: str | None = None, - min_path_length: int = 3, - max_path_length: int = 6, - train_size: int = 50_000, - eval_size: int = 1_000, - eval_target_fraction: float = 0.1, - split_seed: int = 0, - links_only: bool = False, - allow_go_back: bool = True, - max_turns: int = 50, - timeout_seconds: float = 1200.0, - efficiency_weight: float = 0.0, - stratify_path_length: bool = True, -) -> vf.Env: +def load_environment(config: WikispeediaEnvConfig) -> vf.Env: """Load the v1 Wikispeedia taskset with a LangChain Deep Agents harness.""" return vf.Env( - taskset=load_taskset( - cache_dir=cache_dir, - min_path_length=min_path_length, - max_path_length=max_path_length, - train_size=train_size, - eval_size=eval_size, - eval_target_fraction=eval_target_fraction, - split_seed=split_seed, - links_only=links_only, - allow_go_back=allow_go_back, - max_turns=max_turns, - 
efficiency_weight=efficiency_weight, - stratify_path_length=stratify_path_length, - config=config.taskset, - ), - harness=load_harness( - max_turns=max_turns, - timeout_seconds=timeout_seconds, - config=config.harness, - ), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) diff --git a/environments/mcp_search_env/mcp_search_env.py b/environments/mcp_search_env/mcp_search_env.py index c800dbb9f..a15ab78ae 100644 --- a/environments/mcp_search_env/mcp_search_env.py +++ b/environments/mcp_search_env/mcp_search_env.py @@ -146,7 +146,7 @@ def load_toolset( def load_taskset( - config: MCPSearchTasksetConfig | None = None, + config: MCPSearchTasksetConfig, dataset: Iterable[vf.ConfigMap] | None = None, mcp_servers: Iterable[vf.ConfigMap] | None = None, max_turns: int | None = None, @@ -169,35 +169,17 @@ def load_taskset( ) -def load_harness(config: vf.HarnessConfig | None = None): +def load_harness(config: vf.HarnessConfig): return vf.Harness(config=config) -def load_environment( - config: vf.EnvConfig, - dataset: Iterable[vf.ConfigMap] | None = None, - mcp_servers: Iterable[vf.ConfigMap] | None = None, - max_turns: int | None = None, -) -> vf.Env: - taskset_overrides: vf.ConfigData = {} - if mcp_servers is not None: - taskset_overrides["mcp_servers"] = [dict(server) for server in mcp_servers] - if max_turns is not None: - taskset_overrides["max_turns"] = max_turns - config = vf.EnvConfig( - config, - taskset=MCPSearchTasksetConfig(**taskset_overrides), - ) - taskset_config = ( - None if config.taskset is None else MCPSearchTasksetConfig(config.taskset) - ) - harness_config = ( - None if config.harness is None else vf.HarnessConfig(config.harness) - ) +class MCPSearchEnvConfig(vf.EnvConfig): + taskset: MCPSearchTasksetConfig + harness: vf.HarnessConfig + + +def load_environment(config: MCPSearchEnvConfig) -> vf.Env: return vf.Env( - taskset=load_taskset( - config=taskset_config, - dataset=dataset, - ), - 
harness=load_harness(config=harness_config), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) diff --git a/environments/nested_harness_v1/nested_harness_v1.py b/environments/nested_harness_v1/nested_harness_v1.py index 2cb013ba5..b57657e68 100644 --- a/environments/nested_harness_v1/nested_harness_v1.py +++ b/environments/nested_harness_v1/nested_harness_v1.py @@ -82,7 +82,7 @@ async def parent_program(task, state): return state -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset( source=source, rewards=[exact_answer], @@ -90,8 +90,7 @@ def load_taskset(config: vf.TasksetConfig | None = None): ) -def load_harness(config: NestedHarnessConfig | None = None): - config = NestedHarnessConfig(config) +def load_harness(config: NestedHarnessConfig): return vf.Harness( program=parent_program, toolsets=[load_toolset(config.toolset)], @@ -103,5 +102,5 @@ def load_harness(config: NestedHarnessConfig | None = None): def load_environment(config: vf.EnvConfig): return vf.Env( taskset=load_taskset(config=config.taskset), - harness=load_harness(config=config.harness), + harness=load_harness(config=NestedHarnessConfig(config.harness)), ) diff --git a/environments/openai_agents_env/README.md b/environments/openai_agents_env/README.md index ca8359f41..39efb3cb3 100644 --- a/environments/openai_agents_env/README.md +++ b/environments/openai_agents_env/README.md @@ -12,7 +12,7 @@ ### Datasets - **Primary dataset(s)**: `gsm8k` train (train) and test (eval) via `load_example_dataset` - **Source links**: Uses the example loader in `verifiers.utils.data_utils` -- **Split sizes**: Configurable via args; defaults to 50 train / 20 eval +- **Split sizes**: Configurable via taskset config; defaults to 50 train / 20 eval ### Task - **Type**: `vf.Env` with a GSM8K `vf.Taskset` and OpenAI Agents SDK `vf.Harness` @@ -33,12 +33,11 @@ Configure model and sampling: ```bash prime eval run 
openai-agents-env \ -m gpt-4.1-mini \ - -n 20 -r 3 -t 1024 -T 0.7 \ - -a '{"num_train_examples": 50, "num_eval_examples": 20}' + -n 20 -r 3 -t 1024 -T 0.7 ``` -### Environment Arguments -| Arg | Type | Default | Description | +### Taskset Config +| Field | Type | Default | Description | | --- | ---- | ------- | ----------- | | `num_train_examples` | int | `50` | Number of training examples | | `num_eval_examples` | int | `20` | Number of evaluation examples | diff --git a/environments/openai_agents_env/openai_agents_env.py b/environments/openai_agents_env/openai_agents_env.py index f9382ccfe..cc961bc50 100644 --- a/environments/openai_agents_env/openai_agents_env.py +++ b/environments/openai_agents_env/openai_agents_env.py @@ -6,6 +6,16 @@ ANSWER_RE = re.compile(r"^\s*ANSWER\s*:?\s*(.+?)\s*$", re.IGNORECASE) +class OpenAIAgentsTasksetConfig(vf.TasksetConfig): + num_train_examples: int = 50 + num_eval_examples: int = 20 + + +class OpenAIAgentsEnvConfig(vf.EnvConfig): + taskset: OpenAIAgentsTasksetConfig + harness: vf.HarnessConfig + + def calculate(expression: str) -> str: """Evaluate a math expression and return the result.""" try: @@ -103,35 +113,23 @@ def answer_reward(task: vf.Task, state: vf.State) -> float: return answers_match(agent_answer, str(task.get("answer", ""))) -def load_taskset( - num_train_examples: int = 50, - num_eval_examples: int = 20, - config: vf.TasksetConfig | None = None, -) -> vf.Taskset: +def load_taskset(config: OpenAIAgentsTasksetConfig) -> vf.Taskset: return vf.Taskset( - source=lambda: load_rows("train", num_train_examples), - eval_source=lambda: load_rows("test", num_eval_examples), + source=lambda: load_rows("train", config.num_train_examples), + eval_source=lambda: load_rows("test", config.num_eval_examples), taskset_id="gsm8k-openai-agents", rewards=[answer_reward], config=config, ) -def load_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: +def load_harness(config: vf.HarnessConfig) -> vf.Harness: return 
vf.Harness(program=run_openai_agents_program, config=config) -def load_environment( - config: vf.EnvConfig, - num_train_examples: int = 50, - num_eval_examples: int = 20, -) -> vf.Env: +def load_environment(config: OpenAIAgentsEnvConfig) -> vf.Env: """Load the OpenAI Agents SDK V1 taskset/harness example environment.""" return vf.Env( - taskset=load_taskset( - num_train_examples=num_train_examples, - num_eval_examples=num_eval_examples, - config=config.taskset, - ), - harness=load_harness(config.harness), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) diff --git a/environments/opencode_harbor/opencode_harbor.py b/environments/opencode_harbor/opencode_harbor.py index 7b83167d9..ca4010b22 100644 --- a/environments/opencode_harbor/opencode_harbor.py +++ b/environments/opencode_harbor/opencode_harbor.py @@ -1,8 +1,21 @@ import verifiers as vf -def load_environment(config: vf.EnvConfig) -> vf.Env: +class OpenCodeHarborEnvConfig(vf.EnvConfig): + taskset: vf.HarborTasksetConfig + harness: vf.OpenCodeConfig + + +def load_taskset(config: vf.HarborTasksetConfig) -> vf.HarborTaskset: + return vf.HarborTaskset(config=config) + + +def load_harness(config: vf.OpenCodeConfig) -> vf.OpenCode: + return vf.OpenCode(config=config) + + +def load_environment(config: OpenCodeHarborEnvConfig) -> vf.Env: return vf.Env( - taskset=vf.HarborTaskset(config=config.taskset), - harness=vf.OpenCode(config=config.harness), + taskset=load_taskset(config.taskset), + harness=load_harness(config.harness), ) diff --git a/environments/rlm_swe_v1/rlm_swe_v1.py b/environments/rlm_swe_v1/rlm_swe_v1.py index c4a9da2fb..d2d2523bc 100644 --- a/environments/rlm_swe_v1/rlm_swe_v1.py +++ b/environments/rlm_swe_v1/rlm_swe_v1.py @@ -489,13 +489,13 @@ def extract_gold_patch( def load_taskset( - config: RlmSweTasksetConfig | None = None, + config: RlmSweTasksetConfig, ) -> R2ESWETaskset: return R2ESWETaskset(config=config) def load_harness( - config: vf.RLMConfig | 
None = None, + config: vf.RLMConfig, taskset: R2ESWETaskset | None = None, ) -> vf.RLM: user_config = vf.RLMConfig(config) @@ -515,11 +515,12 @@ def load_harness( ) -def load_environment(config: vf.EnvConfig) -> vf.Env: - taskset_config = ( - None if config.taskset is None else RlmSweTasksetConfig(config.taskset) - ) - harness_config = None if config.harness is None else vf.RLMConfig(config.harness) - taskset = load_taskset(config=taskset_config) - harness = load_harness(config=harness_config, taskset=taskset) +class RlmSweEnvConfig(vf.EnvConfig): + taskset: RlmSweTasksetConfig + harness: vf.RLMConfig + + +def load_environment(config: RlmSweEnvConfig) -> vf.Env: + taskset = load_taskset(config=config.taskset) + harness = load_harness(config=config.harness, taskset=taskset) return vf.Env(taskset=taskset, harness=harness) diff --git a/environments/tau2_bench_v1/tau2_bench_v1.py b/environments/tau2_bench_v1/tau2_bench_v1.py index 257c80bcd..92497dbaa 100644 --- a/environments/tau2_bench_v1/tau2_bench_v1.py +++ b/environments/tau2_bench_v1/tau2_bench_v1.py @@ -746,6 +746,11 @@ def load_rows(): ) +class Tau2EnvConfig(vf.EnvConfig): + taskset: Tau2TasksetConfig + harness: vf.HarnessConfig + + def load_taskset( domain: str | None = None, *, @@ -756,7 +761,7 @@ def load_taskset( max_steps: int | None = None, max_errors: int | None = None, max_turns: int | None = None, - config: Tau2TasksetConfig | None = None, + config: Tau2TasksetConfig, ) -> Tau2Taskset: return Tau2Taskset( domain=domain, @@ -772,35 +777,7 @@ def load_taskset( def load_environment( - domain: str = "telecom", - *, - user_model: str = DEFAULT_USER_MODEL, - user_args: ConfigMap | None = None, - user_base_url: str = DEFAULT_USER_BASE_URL, - user_api_key_var: str = DEFAULT_USER_API_KEY_VAR, - max_steps: int = DEFAULT_MAX_STEPS, - max_errors: int = DEFAULT_MAX_ERRORS, - max_turns: int = DEFAULT_MAX_STEPS, - config: vf.EnvConfig, + config: Tau2EnvConfig, ) -> vf.Env: - config = vf.EnvConfig( - config, - 
taskset=Tau2TasksetConfig( - domain=domain, - user_model=user_model, - user_args=dict(user_args) if user_args is not None else None, - user_base_url=user_base_url, - user_api_key_var=user_api_key_var, - max_steps=max_steps, - max_errors=max_errors, - max_turns=max_turns, - ), - ) - taskset_config = ( - None if config.taskset is None else Tau2TasksetConfig(config.taskset) - ) - harness_config = ( - None if config.harness is None else vf.HarnessConfig(config.harness) - ) - taskset = load_taskset(config=taskset_config) - return vf.Env(taskset=taskset, harness=vf.Harness(config=harness_config)) + taskset = load_taskset(config=config.taskset) + return vf.Env(taskset=taskset, harness=vf.Harness(config=config.harness)) diff --git a/skills/browse-environments/SKILL.md b/skills/browse-environments/SKILL.md index eec60ee23..e45399ec8 100644 --- a/skills/browse-environments/SKILL.md +++ b/skills/browse-environments/SKILL.md @@ -61,7 +61,7 @@ prime eval run name -m openai/gpt-4.1-mini -n 5 ```bash prime env install reverse-text --from-repo ``` -4. For v1 Taskset + Harness examples, inspect the environment package for `load_environment(config: vf.EnvConfig) -> vf.Env` and the direct `Taskset`/`Harness` wiring. Do not assume `load_taskset` or `load_harness` wrappers exist unless they encode real reusable wiring. +4. For v1 Taskset + Harness examples, inspect the environment package for `load_taskset(...)`, optional `load_harness(...)`, and `load_environment(config: vf.EnvConfig) -> vf.Env`; expect a concrete `EnvConfig` subclass when the loader needs concrete child config types. ## Anti-Patterns 1. Do not recommend building from scratch if a strong ecosystem option exists. 
diff --git a/skills/create-environments/SKILL.md b/skills/create-environments/SKILL.md index 8b5de68ee..4f5f21289 100644 --- a/skills/create-environments/SKILL.md +++ b/skills/create-environments/SKILL.md @@ -16,6 +16,8 @@ prime env init my-env prime env install my-env prime eval run my-env -m openai/gpt-4.1-mini -n 5 ``` +Use `prime env init my-env --with-harness` when the environment owns an +explicit harness. 3. Treat `prime eval run` as the canonical eval path. It saves results automatically, so do not add `--skip-upload` unless the user explicitly requests that deviation. 4. Prefer an existing environment as a starting point when possible: ```bash @@ -43,12 +45,12 @@ prime env install math-python --from-repo - `StatefulToolEnv` for per-rollout resources. - `CliAgentEnv` for running agent binaries in sandboxes with API interception. Override `get_sandbox_resources(state)` for per-instance resources, `build_env_vars(state)` for custom env vars. - V1 `vf.Env` with `vf.Taskset`/`vf.Harness` for the current taskset/harness environment pattern that separates the task collection from the rollout runner. Use this for new taskset/harness work that needs config-driven metrics, rewards, toolsets, user functions, endpoint interception, or sandboxed Python/command programs. Framework programs should build clients from `state.get_endpoint_config(api="chat")`. -3. For v1, import `verifiers as vf` and implement `load_environment(config: vf.EnvConfig) -> vf.Env`. Treat `config` as required and typed; the loader is responsible for passing an `EnvConfig`. +3. For v1, import `verifiers as vf`, expose `load_environment(config: vf.EnvConfig) -> vf.Env`, and define a concrete `EnvConfig` subclass only when the taskset or harness uses concrete child config types. 4. For v0 environments, keep the existing `vf.Environment` patterns and preserve v0 compatibility. 5. Add `pyproject.toml` defaults in `[tool.verifiers.eval]` only when stable. ### V1 Authoring Rules -1. 
Keep v1 environment entrypoints tiny: `import verifiers as vf`, define `load_environment(config: vf.EnvConfig) -> vf.Env`, and wire `Taskset`/`Harness` constructors directly. +1. Keep v1 environment entrypoints tiny: `import verifiers as vf`, define `load_taskset(config: MyTasksetConfig)`, optionally define `load_harness(config: MyHarnessConfig)`, and wire them from `load_environment(config: vf.EnvConfig)` or a concrete `EnvConfig` subclass when needed. 2. Use `Taskset(objects=..., bindings=...)` for shared taskset dependencies such as extractors, clients, or format checkers. Do not introduce v1 Parser/Rubric wrappers; parsing is ordinary Python or a bound object. 3. Use `vf.get_messages(state.get("completion") or [], role="assistant")` when reading state completions. The helper returns typed message objects and should not receive `None`. 4. Use `program.channels` for v1 program protocol/channel selection. Do not use stale `program.tools` terminology. @@ -64,25 +66,27 @@ prime env install math-python --from-repo [[env]] id = "owner/my-env" -[env.args] -split = "train" - [env.taskset] num_examples = 100 +split = "train" [env.harness] max_turns = 8 ``` 6. In code, normalize config at the loader boundary and pass child configs directly: ```python -def load_environment(config: vf.EnvConfig | None = None) -> vf.Env: - config = config or vf.EnvConfig() +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: vf.HarnessConfig + + +def load_environment(config: MyEnvConfig) -> vf.Env: return vf.Env( - taskset=load_taskset(config.taskset), - harness=load_harness(config.harness), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) ``` -7. If concise env-level named args are useful, map them explicitly into `vf.EnvConfig(...)` once in `load_environment`; do not thread loose kwargs through taskset and harness internals. +7. Do not add root env config knobs. 
Put settings as leaf fields on the taskset or harness config that owns them. ### 2. Port From Another Library, Project, or Paper 1. Create a strict source-to-target mapping before coding: diff --git a/skills/evaluate-environments/SKILL.md b/skills/evaluate-environments/SKILL.md index dda3a61b9..a506f80bd 100644 --- a/skills/evaluate-environments/SKILL.md +++ b/skills/evaluate-environments/SKILL.md @@ -99,9 +99,9 @@ prime eval run configs/eval/my-benchmark.toml 3. Make config files the default for benchmark sweeps, multi-model comparisons, and recurring reports. ## Common Evaluation Patterns -1. Pass args to `load_environment()`: +1. Override v1 taskset and harness config through explicit child sections: ```bash -prime eval run my-env -a '{"difficulty":"hard"}' +prime eval run my-env -a '{"config":{"taskset":{"difficulty":"hard"},"harness":{"max_turns":20}}}' ``` 2. Override constructor kwargs: ```bash @@ -146,10 +146,10 @@ env_id = "my-env" [ablation.sweep] temperature = [0.0, 0.5, 1.0] -[ablation.sweep.args] +[ablation.sweep.taskset] difficulty = ["easy", "hard"] ``` -This generates the cartesian product (6 configs in this example). Use `--abbreviated-summary` (`-A`) for compact ablation results. +This generates the cartesian product (6 configs in this example). Sweep v1 environment-owned settings under `taskset` or `harness`, not as root args. Use `--abbreviated-summary` (`-A`) for compact ablation results. ## Inspect Saved Results 1. Browse locally saved runs: diff --git a/skills/optimize-with-environments/SKILL.md b/skills/optimize-with-environments/SKILL.md index 301f8f6a8..8214d5dc0 100644 --- a/skills/optimize-with-environments/SKILL.md +++ b/skills/optimize-with-environments/SKILL.md @@ -37,11 +37,22 @@ prime eval run my-env -m openai/gpt-4.1-mini -n 50 -r 3 -s ```bash prime gepa run my-env -m openai/gpt-4.1-mini -M openai/gpt-4.1-mini -B 500 -n 100 -N 50 ``` -4. Or run from config: +4. 
Keep v1 environment settings under `taskset` and `harness` config sections: +```toml +[[env]] +id = "my-env" + +[env.taskset] +split = "train" + +[env.harness] +max_turns = 8 +``` +5. Or run from config: ```bash prime gepa run configs/gepa/qwen-3-5.toml ``` -5. Re-evaluate with optimized prompt and compare against baseline. +6. Re-evaluate with optimized prompt and compare against baseline. ## High-Value Settings 1. `-B/--max-calls`: total optimization budget. diff --git a/skills/review-environments/SKILL.md b/skills/review-environments/SKILL.md index 4ceb85f56..5959ecd32 100644 --- a/skills/review-environments/SKILL.md +++ b/skills/review-environments/SKILL.md @@ -59,7 +59,7 @@ prime eval run -m openai/gpt-4.1-mini -n 5 ## Config And Docs Surface 1. Check that eval, GEPA, RL, and Hosted Training examples use the same public TOML shape where applicable. -2. For v1 configs, prefer `[env.args]`, `[env.taskset]`, and `[env.harness]`; loader code should normalize at the boundary instead of spreading compatibility branches through examples. +2. For v1 configs, route settings through `[env.taskset]` and `[env.harness]`; use a concrete `EnvConfig` subclass when the loader needs concrete child config types, and avoid root env config knobs. 3. If docs changed public behavior, verify the relevant bundled skill was updated too. ## Findings Format diff --git a/skills/train-with-environments/SKILL.md b/skills/train-with-environments/SKILL.md index 79d7b09b3..4311cd534 100644 --- a/skills/train-with-environments/SKILL.md +++ b/skills/train-with-environments/SKILL.md @@ -36,7 +36,7 @@ prime lab setup --prime-rl prime env install my-env prime eval run my-env -m openai/gpt-4.1-mini -n 20 -r 3 -s ``` -2. For v1 Taskset + Harness environments, verify the package exposes `load_environment(config: vf.EnvConfig) -> vf.Env`; trainers interact with the same environment boundary even when the implementation is BYO Harness internally. +2. 
For v1 Taskset + Harness environments, verify the package exposes `load_environment(config: vf.EnvConfig) -> vf.Env`, or a concrete `EnvConfig` subclass when the loader needs concrete child config types; trainers interact with the same environment boundary even when the implementation is BYO Harness internally. 3. Confirm reward diversity exists at baseline. 4. Start with conservative run length and inspect samples early. @@ -55,18 +55,16 @@ prime env push my-env --visibility PRIVATE ## RL TOML Environment Sections 1. Use the same environment config shape for Hosted Training and `prime-rl`. -2. Put normal `load_environment(...)` named args in `[env.args]`. -3. Put v1 taskset config in `[env.taskset]` and v1 harness config in `[env.harness]`. +2. Put task-owned v1 config in `[env.taskset]`. +3. Put harness-owned v1 config in `[env.harness]`. 4. Keep model, endpoint, sampling, rollout count, and trainer controls outside the environment sections unless configuring a nested or auxiliary harness model. 
```toml [[env]] id = "owner/my-env" -[env.args] -split = "train" - [env.taskset] num_examples = 1000 +split = "train" [env.harness] max_turns = 8 diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py index bec58212f..246d1900e 100644 --- a/tests/test_langchain_deep_agents_wikispeedia.py +++ b/tests/test_langchain_deep_agents_wikispeedia.py @@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness( ) -> None: module = load_module(monkeypatch) - env = module.load_environment(config=vf.EnvConfig(), train_size=1, eval_size=1) + env = module.load_environment(config=module.WikispeediaEnvConfig()) assert isinstance(env, vf.Env) assert isinstance(env.taskset, vf.Taskset) @@ -65,6 +65,43 @@ def test_wikispeedia_loads_as_v1_taskset_harness( assert env.taskset.taskset_id == "langchain-deep-agents-wikispeedia" +def test_wikispeedia_env_config_reaches_taskset_and_harness( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = load_module(monkeypatch) + wiki = make_small_wiki(module) + monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki) + + env = module.load_environment( + config=module.WikispeediaEnvConfig( + taskset={ + "train_size": 2, + "eval_size": 1, + "min_path_length": 1, + "max_path_length": 1, + "eval_target_fraction": 0.5, + "allow_go_back": False, + "links_only": True, + "max_turns": 7, + }, + harness={ + "max_turns": 8, + "timeout_seconds": 9.0, + }, + ) + ) + + train_rows = list(env.taskset.source()) + eval_rows = list(env.taskset.eval_source()) + + assert len(train_rows) == 2 + assert len(eval_rows) == 1 + assert train_rows[0]["max_turns"] == 7 + assert env.harness.config.max_turns == 8 + assert env.harness.config.timeout_seconds == 9.0 + assert [tool.__name__ for tool in env.taskset.toolsets[0].tools] == ["click_link"] + + def test_wikispeedia_rows_use_v1_task_shape( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -90,11 +127,13 @@ def 
test_wikispeedia_taskset_sources_use_disjoint_target_split( wiki = make_small_wiki(module) monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki) taskset = module.load_taskset( - train_size=2, - eval_size=1, - min_path_length=1, - max_path_length=1, - eval_target_fraction=0.5, + config=module.WikispeediaTasksetConfig( + train_size=2, + eval_size=1, + min_path_length=1, + max_path_length=1, + eval_target_fraction=0.5, + ) ) train_rows = list(taskset.source()) @@ -114,8 +153,12 @@ def test_wikispeedia_efficiency_weight_uses_fresh_reward_wrapper( wiki = make_small_wiki(module) monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki) - weighted = module.load_taskset(efficiency_weight=0.5) - plain = module.load_taskset(efficiency_weight=0.0) + weighted = module.load_taskset( + config=module.WikispeediaTasksetConfig(efficiency_weight=0.5) + ) + plain = module.load_taskset( + config=module.WikispeediaTasksetConfig(efficiency_weight=0.0) + ) assert any(fn.__name__ == "path_efficiency" for fn in weighted.rewards) assert any(fn is module.path_efficiency for fn in plain.metrics) @@ -127,13 +170,17 @@ def test_wikispeedia_taskset_owns_navigation_tools( ) -> None: module = load_module(monkeypatch) - taskset = module.load_taskset(allow_go_back=True) + taskset = module.load_taskset( + config=module.WikispeediaTasksetConfig(allow_go_back=True) + ) names = [tool.__name__ for tool in taskset.toolsets[0].tools] - no_back = module.load_taskset(allow_go_back=False) + no_back = module.load_taskset( + config=module.WikispeediaTasksetConfig(allow_go_back=False) + ) assert names == ["click_link", "go_back"] assert [tool.__name__ for tool in no_back.toolsets[0].tools] == ["click_link"] - assert module.load_harness().toolsets == [] + assert module.load_harness(config=module.WikispeediaHarnessConfig()).toolsets == [] def test_wikispeedia_system_prompt_matches_available_tools( @@ -141,8 +188,12 @@ def 
test_wikispeedia_system_prompt_matches_available_tools( ) -> None: module = load_module(monkeypatch) - with_back = module.load_taskset(allow_go_back=True) - without_back = module.load_taskset(allow_go_back=False) + with_back = module.load_taskset( + config=module.WikispeediaTasksetConfig(allow_go_back=True) + ) + without_back = module.load_taskset( + config=module.WikispeediaTasksetConfig(allow_go_back=False) + ) assert "go_back" in with_back.system_prompt[0]["content"] assert "go_back" not in without_back.system_prompt[0]["content"] @@ -156,12 +207,16 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime( module = load_module(monkeypatch) wiki = make_small_wiki(module) monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki) - env = module.load_environment( - config=vf.EnvConfig(), - train_size=2, - eval_size=1, - min_path_length=1, - max_path_length=1, + env = vf.Env( + taskset=module.load_taskset( + config=module.WikispeediaTasksetConfig( + train_size=2, + eval_size=1, + min_path_length=1, + max_path_length=1, + ) + ), + harness=module.load_harness(config=module.WikispeediaHarnessConfig()), ) task = module.vf.Task(list(env.taskset.source())[0]).freeze() state = module.vf.State.for_task(task) diff --git a/tests/test_mcp_search_env.py b/tests/test_mcp_search_env.py index bb23884fd..7fcb509f0 100644 --- a/tests/test_mcp_search_env.py +++ b/tests/test_mcp_search_env.py @@ -26,7 +26,9 @@ def _load_mcp_search_module() -> Any: def test_mcp_search_env_is_v1_only() -> None: module = _load_mcp_search_module() - env = module.load_environment(config=vf.EnvConfig(), max_turns=4) + env = module.load_environment( + config=module.MCPSearchEnvConfig(taskset={"max_turns": 4}) + ) assert isinstance(env, vf.Env) assert isinstance(env.taskset, vf.Taskset) @@ -40,7 +42,7 @@ def test_mcp_search_env_is_v1_only() -> None: def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None: module = _load_mcp_search_module() - rows = 
module.load_taskset().rows() + rows = module.load_taskset(config=module.MCPSearchTasksetConfig()).rows() assert len(rows) >= 10 assert len({row["answer"] for row in rows}) == len(rows) @@ -52,7 +54,7 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None: module = _load_mcp_search_module() env = module.load_environment( - config=vf.EnvConfig(taskset={"max_turns": 3}), + config=module.MCPSearchEnvConfig(taskset={"max_turns": 3}), ) rows = env.taskset.rows() diff --git a/tests/test_opencode_harbor.py b/tests/test_opencode_harbor.py index d35d341f1..5aafc9572 100644 --- a/tests/test_opencode_harbor.py +++ b/tests/test_opencode_harbor.py @@ -28,7 +28,7 @@ def _load_opencode_module() -> Any: def test_load_environment_uses_v1_taskset_and_harness() -> None: module = _load_opencode_module() - env = module.load_environment(config=vf.EnvConfig()) + env = module.load_environment(config=module.OpenCodeHarborEnvConfig()) assert isinstance(env, vf.Env) assert isinstance(env.taskset, vf.HarborTaskset) @@ -52,7 +52,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config() -> None: module = _load_opencode_module() env = module.load_environment( - config=vf.EnvConfig( + config=module.OpenCodeHarborEnvConfig( taskset={ "task_names": ["task-a"], "cpu_cores": 1.5, diff --git a/tests/test_v1_bfcl.py b/tests/test_v1_bfcl.py index f44037132..015e4d208 100644 --- a/tests/test_v1_bfcl.py +++ b/tests/test_v1_bfcl.py @@ -75,12 +75,12 @@ def test_bfcl_public_loader_is_v1_only(monkeypatch: pytest.MonkeyPatch) -> None: seen_taskset_config: vf.TasksetConfig | None = None seen_harness_config: vf.HarnessConfig | None = None - def fake_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset: + def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset: nonlocal seen_taskset_config seen_taskset_config = config return vf.Taskset(source=[], config=config) - def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: + def fake_harness(config: vf.HarnessConfig) 
-> vf.Harness: nonlocal seen_harness_config seen_harness_config = config return vf.Harness(config=config) @@ -89,9 +89,13 @@ def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: monkeypatch.setattr(bfcl, "load_harness", fake_harness) env = bfcl.load_environment( - config=vf.EnvConfig(), - test_category="simple_python", - examples_per_category=0, + config=bfcl.BFCLEnvConfig( + taskset=bfcl.BFCLTasksetConfig( + test_category="simple_python", + examples_per_category=0, + ), + harness=bfcl.BFCLHarnessConfig(), + ) ) assert isinstance(env, vf.Env) @@ -110,12 +114,12 @@ def test_bfcl_loader_supports_category_groups( seen_taskset_categories = [] seen_harness_categories = [] - def fake_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset: + def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset: assert isinstance(config, bfcl.BFCLTasksetConfig) seen_taskset_categories.append(config.test_category) return vf.Taskset(source=[{"question": "q", "answer": "a"}], config=config) - def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: + def fake_harness(config: vf.HarnessConfig) -> vf.Harness: assert isinstance(config, bfcl.BFCLHarnessConfig) seen_harness_categories.append(config.test_category) return vf.Harness(config=config) @@ -124,9 +128,13 @@ def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: monkeypatch.setattr(bfcl, "load_harness", fake_harness) env = bfcl.load_environment( - config=vf.EnvConfig(), - test_categories=["simple_python", "simple_java"], - examples_per_category=0, + config=bfcl.BFCLEnvConfig( + taskset=bfcl.BFCLTasksetConfig( + test_categories=["simple_python", "simple_java"], + examples_per_category=0, + ), + harness=bfcl.BFCLHarnessConfig(), + ) ) assert isinstance(env, root_vf.EnvGroup) diff --git a/tests/test_v1_config_extension.py b/tests/test_v1_config_extension.py index e6b4db29a..4b282d643 100644 --- a/tests/test_v1_config_extension.py +++ b/tests/test_v1_config_extension.py @@ -1185,6 
+1185,29 @@ def test_config_schema_is_visible_from_primary_types() -> None: assert "bindings" in vf.ToolsetConfig.schema_text() +def test_config_annotation_only_nested_config_defaults_recursively() -> None: + class LeafConfig(Config): + value: int = 1 + + class ChildConfig(Config): + leaf: LeafConfig + + class ParentConfig(Config): + child: ChildConfig + + first = ParentConfig() + second = ParentConfig() + configured = ParentConfig({"child": {"leaf": {"value": 3}}}) + + assert isinstance(first.child, ChildConfig) + assert isinstance(first.child.leaf, LeafConfig) + assert first.child.leaf.value == 1 + assert first.child is not second.child + assert first.child.leaf is not second.child.leaf + assert configured.child.leaf.value == 3 + assert "child: ChildConfig = " in ParentConfig.schema_text() + + def test_env_config_normalizes_mapping_config_to_attributes() -> None: config = EnvConfig( { @@ -1193,8 +1216,17 @@ def test_env_config_normalizes_mapping_config_to_attributes() -> None: } ) - assert config.taskset == {"taskset_id": "dict"} - assert config.harness == {"model": "configured-model"} + assert isinstance(config.taskset, TasksetConfig) + assert isinstance(config.harness, HarnessConfig) + assert config.taskset.taskset_id == "dict" + assert config.harness.model == "configured-model" + + +def test_env_config_defaults_taskset_and_harness_to_base_configs() -> None: + config = EnvConfig() + + assert isinstance(config.taskset, TasksetConfig) + assert isinstance(config.harness, HarnessConfig) def test_env_config_rejects_unknown_top_level_sections() -> None: @@ -1205,6 +1237,34 @@ def test_env_config_rejects_unknown_top_level_sections() -> None: def test_env_config_requires_child_sections_to_be_configs() -> None: with pytest.raises(ValueError): EnvConfig({"taskset": 1}) + with pytest.raises(ValueError, match="EnvConfig.taskset cannot be None"): + EnvConfig({"taskset": None}) + with pytest.raises(ValueError, match="EnvConfig.harness cannot be None"): + 
EnvConfig(harness=None) + + +def test_env_config_child_config_objects_must_match_domain() -> None: + class LocalTasksetConfig(TasksetConfig): + split: str = "train" + + class LocalHarnessConfig(HarnessConfig): + mode: str = "default" + + config = EnvConfig( + taskset=LocalTasksetConfig(split="test"), + harness=LocalHarnessConfig(mode="custom"), + ) + + assert isinstance(config.taskset, LocalTasksetConfig) + assert isinstance(config.harness, LocalHarnessConfig) + + class LocalConfig(Config): + split: str = "train" + + with pytest.raises(ValueError): + EnvConfig(taskset=LocalConfig()) + with pytest.raises(ValueError): + EnvConfig(harness=LocalConfig()) def test_env_config_merges_child_config_defaults_with_nested_sections() -> None: @@ -1234,25 +1294,53 @@ class LocalTasksetConfig(TasksetConfig): assert default_config.taskset.split == "kwarg" -def test_env_config_args_supplies_typed_top_level_args() -> None: - class LocalArgsConfig(Config): - split: str = "train" - max_turns: int = 4 - - config = EnvConfig( - {"args": {"max_turns": 7}}, - args=LocalArgsConfig(split="args"), +def test_config_object_merge_omits_nested_none_values() -> None: + base = HarnessConfig( + sampling_args={ + "temperature": 0.7, + "extra_body": { + "top_k": 40, + "top_p": 0.9, + }, + } ) + override = HarnessConfig( + sampling_args={ + "extra_body": { + "top_p": None, + "min_p": 0.05, + }, + "stop": [None, "DONE"], + } + ) + config = EnvConfig(EnvConfig(harness=override), harness=base) + + assert config.harness.sampling_args == { + "temperature": 0.7, + "extra_body": { + "top_k": 40, + "top_p": 0.9, + "min_p": 0.05, + }, + "stop": [None, "DONE"], + } - assert isinstance(config.args, LocalArgsConfig) - assert config.args.split == "args" - assert config.args.max_turns == 7 +def test_env_config_subclasses_cannot_define_root_fields() -> None: + with pytest.raises(TypeError, match="unsupported root env config fields"): -def test_env_config_args_accepts_arbitrary_user_args() -> None: - config = 
EnvConfig(args={"k1": "v1", "k2": "v2"}) + class LocalEnvConfig(EnvConfig): + split: str = "train" - assert config.args == {"k1": "v1", "k2": "v2"} + +def test_env_config_subclasses_must_use_domain_child_configs() -> None: + class LocalConfig(Config): + split: str = "train" + + with pytest.raises(TypeError, match="taskset must be typed"): + + class LocalEnvConfig(EnvConfig): + taskset: LocalConfig def test_env_config_harness_section_extends_imported_config() -> None: @@ -1335,6 +1423,56 @@ def load_environment(split: str = "train", *, config: EnvConfig) -> Env: } +def test_load_environment_coerces_env_config_subclass_sections( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module_name = "typed_env_config_subclass" + module = types.ModuleType(module_name) + seen: dict[str, object] = {} + + class LocalTasksetConfig(TasksetConfig): + split: str = "train" + + class LocalHarnessConfig(HarnessConfig): + mode: str = "default" + + class LocalEnvConfig(EnvConfig): + taskset: LocalTasksetConfig + harness: LocalHarnessConfig + + class LocalTaskset(Taskset): + config_type = LocalTasksetConfig + + class LocalHarness(Harness): + config_type = LocalHarnessConfig + + def load_environment(config: LocalEnvConfig) -> Env: + seen["config"] = config + return Env( + taskset=LocalTaskset(source=source_loader, config=config.taskset), + harness=LocalHarness(config=config.harness), + ) + + module.load_environment = load_environment + monkeypatch.setitem(sys.modules, module_name, module) + + env = vf.load_environment( + "typed-env-config-subclass", + config={ + "taskset": {"taskset_id": "typed", "split": "test"}, + "harness": {"mode": "custom"}, + }, + ) + config = seen["config"] + + assert isinstance(config, LocalEnvConfig) + assert isinstance(config.taskset, LocalTasksetConfig) + assert isinstance(config.harness, LocalHarnessConfig) + assert env.taskset.config.taskset_id == "typed" + assert env.taskset.config.split == "test" + assert env.harness.config.mode == "custom" + + def 
test_load_environment_supplies_default_typed_env_config( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -1457,9 +1595,24 @@ def test_reference_v1_harness_loaders_preserve_child_defaults() -> None: "environments.hello_self_judge_v1.hello_self_judge_v1" ) - assert group_reward.load_harness().config.max_turns == 1 - assert parallel_sandbox.load_harness().config.max_turns == 4 - assert self_judge.load_harness().config.max_turns == 8 + assert ( + group_reward.load_harness( + config=group_reward.GroupRewardHarnessConfig() + ).config.max_turns + == 1 + ) + assert ( + parallel_sandbox.load_harness( + config=parallel_sandbox.ParallelSandboxHarnessConfig() + ).config.max_turns + == 4 + ) + assert ( + self_judge.load_harness( + config=self_judge.SelfJudgeHarnessConfig() + ).config.max_turns + == 8 + ) def test_bfcl_loader_preserves_mapping_config_sections( @@ -1482,7 +1635,7 @@ def fake_harness(config: object = None, **kwargs: object) -> Harness: monkeypatch.setattr(module, "load_harness", fake_harness) env = module.load_environment( - config=EnvConfig( + config=module.BFCLEnvConfig( taskset={"taskset_id": "bfcl-env-args"}, harness={"model": "bfcl-model"}, ) @@ -1508,7 +1661,7 @@ def fake_taskset(config: object = None) -> Taskset: monkeypatch.setattr(module, "load_taskset", fake_taskset) env = module.load_environment( - config=EnvConfig( + config=module.Tau2EnvConfig( taskset={"max_turns": 7}, harness={"model": "configured-model", "max_turns": 3}, ) @@ -1623,17 +1776,16 @@ def test_self_judge_loader_projects_shortcuts_to_child_configs() -> None: "environments.hello_self_judge_v1.hello_self_judge_v1" ) - taskset = module.load_taskset(num_examples=2) - harness = module.load_harness(max_turns=3) + taskset = module.load_taskset(config=module.SelfJudgeTasksetConfig(num_examples=2)) + harness = module.load_harness(config=module.SelfJudgeHarnessConfig(max_turns=3)) shortcut_env = module.load_environment( - num_examples=2, - max_turns=3, - config=EnvConfig(), + 
config=module.SelfJudgeEnvConfig( + taskset={"num_examples": 2}, + harness={"max_turns": 3}, + ), ) override_env = module.load_environment( - num_examples=2, - max_turns=3, - config=EnvConfig( + config=module.SelfJudgeEnvConfig( taskset={"num_examples": 1}, harness={"max_turns": 5}, ), diff --git a/tests/test_v1_group_reward_env.py b/tests/test_v1_group_reward_env.py index 8280523b0..3e884650b 100644 --- a/tests/test_v1_group_reward_env.py +++ b/tests/test_v1_group_reward_env.py @@ -2,16 +2,21 @@ import pytest -import verifiers.v1 as vf from verifiers.clients import Client from verifiers.types import RolloutInput -from environments.hello_group_reward_v1.hello_group_reward_v1 import load_environment +from environments.hello_group_reward_v1.hello_group_reward_v1 import ( + GroupRewardEnvConfig, + GroupRewardTasksetConfig, + load_environment, +) @pytest.mark.asyncio async def test_hello_group_reward_v1_scores_full_group_lifecycle() -> None: - env = load_environment(num_examples=1, config=vf.EnvConfig()) + env = load_environment( + config=GroupRewardEnvConfig(taskset=GroupRewardTasksetConfig(num_examples=1)) + ) assert env.requires_group_rollouts assert env.provides_advantages diff --git a/tests/test_v1_rlm_swe.py b/tests/test_v1_rlm_swe.py index 6c4cb2ed2..235c638ad 100644 --- a/tests/test_v1_rlm_swe.py +++ b/tests/test_v1_rlm_swe.py @@ -135,7 +135,7 @@ def fake_load_dataset(dataset_name: str, **kwargs: object) -> Dataset: monkeypatch.setattr(rlm_swe_v1, "load_dataset", fake_load_dataset) env = rlm_swe_v1.load_environment( - config=vf.EnvConfig( + config=rlm_swe_v1.RlmSweEnvConfig( taskset=rlm_swe_v1.RlmSweTasksetConfig( dataset_name="fake-r2e", timeout_minutes=30, @@ -171,7 +171,7 @@ def fake_load_dataset(dataset_name: str, **kwargs: object) -> Dataset: def test_rlm_swe_taskset_hooks_are_registered_with_runtime(): - taskset = rlm_swe_v1.load_taskset() + taskset = rlm_swe_v1.load_taskset(config=rlm_swe_v1.RlmSweTasksetConfig()) env = vf.Env(taskset=taskset) 
setup_names = [handler.__name__ for handler in env.harness.runtime.rollout_setup] @@ -262,7 +262,7 @@ def test_rlm_swe_get_env_vars_uses_configured_repo_path(): def test_rlm_swe_reward_rejects_pytest_summary_without_nodeid(): - taskset = rlm_swe_v1.load_taskset() + taskset = rlm_swe_v1.load_taskset(config=rlm_swe_v1.RlmSweTasksetConfig()) test_output = """ =========================== short test summary info ============================ PASSED tests/test_example.py diff --git a/verifiers/scripts/init.py b/verifiers/scripts/init.py index 41de3b572..39c152510 100644 --- a/verifiers/scripts/init.py +++ b/verifiers/scripts/init.py @@ -35,21 +35,22 @@ ```bash prime eval run {env_id_dash} \ -m openai/gpt-4.1-mini \ - -n 20 -r 3 -t 1024 -T 0.7 \ - -a '{{"key": "value"}}' # env-specific args as JSON + -n 20 -r 3 -t 1024 -T 0.7 ``` Notes: -- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. +- Put task-owned settings under `[env.taskset]` and harness-owned settings under `[env.harness]` in TOML configs. -### Environment Arguments -Document any supported environment arguments and their meaning. Example: +### Taskset Config +Document any taskset config fields and their meaning. Example: -| Arg | Type | Default | Description | +| Field | Type | Default | Description | | --- | ---- | ------- | ----------- | -| `foo` | str | `"bar"` | What this controls | | `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) | +### Harness Config +Document any harness config fields and their meaning. + ### Metrics Summarize key metrics your rubric emits and how they’re interpreted. @@ -139,16 +140,64 @@ __all__ = ["load_environment"] """ -ENVIRONMENT_TEMPLATE = '''\ +ENVIRONMENT_TEMPLATE = """\ import verifiers as vf -def load_environment(**kwargs) -> vf.Environment: - """ - Loads a custom environment. 
- """ - raise NotImplementedError("Implement your custom environment here.") -''' +def source(): + return [ + { + "prompt": [{"role": "user", "content": "Reverse abc."}], + "answer": "cba", + } + ] + + +@vf.reward(weight=1.0) +async def exact_answer(task, state) -> float: + return float(task["answer"] in str(state.get("completion") or "")) + + +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: + return vf.Taskset(source=source, rewards=[exact_answer], config=config) + + +def load_environment(config: vf.EnvConfig) -> vf.Env: + return vf.Env(taskset=load_taskset(config=config.taskset)) +""" + +HARNESS_ENVIRONMENT_TEMPLATE = """\ +import verifiers as vf + + +def source(): + return [ + { + "prompt": [{"role": "user", "content": "Reverse abc."}], + "answer": "cba", + } + ] + + +@vf.reward(weight=1.0) +async def exact_answer(task, state) -> float: + return float(task["answer"] in str(state.get("completion") or "")) + + +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: + return vf.Taskset(source=source, rewards=[exact_answer], config=config) + + +def load_harness(config: vf.HarnessConfig) -> vf.Harness: + return vf.Harness(config=config) + + +def load_environment(config: vf.EnvConfig) -> vf.Env: + return vf.Env( + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), + ) +""" OPENENV_ENVIRONMENT_TEMPLATE = """\ import verifiers as vf @@ -315,6 +364,7 @@ def init_environment( rewrite_readme: bool = False, multi_file: bool = False, openenv: bool = False, + with_harness: bool = False, ) -> Path: """ Initialize a new verifiers environment. 
@@ -373,7 +423,12 @@ def init_environment( # create environment file if it doesn't exist environment_file = environment_dir / f"{env_id_underscore}.py" if not environment_file.exists(): - template = OPENENV_ENVIRONMENT_TEMPLATE if openenv else ENVIRONMENT_TEMPLATE + if openenv: + template = OPENENV_ENVIRONMENT_TEMPLATE + elif with_harness: + template = HARNESS_ENVIRONMENT_TEMPLATE + else: + template = ENVIRONMENT_TEMPLATE environment_file.write_text(template) else: print( @@ -418,6 +473,12 @@ def main(): default=False, help="Initialize with the enforced OpenEnv layout (proj/ + vf-build workflow).", ) + parser.add_argument( + "--with-harness", + action="store_true", + default=False, + help="Include an explicit v1 load_harness stub.", + ) args = parser.parse_args() init_environment( @@ -426,6 +487,7 @@ def main(): rewrite_readme=args.rewrite_readme, multi_file=args.multi_file, openenv=args.openenv, + with_harness=args.with_harness, ) diff --git a/verifiers/serve/types.py b/verifiers/serve/types.py index 25ddbbc94..3c252c230 100644 --- a/verifiers/serve/types.py +++ b/verifiers/serve/types.py @@ -1,8 +1,9 @@ from asyncio import Future from enum import Enum -from typing import Annotated, Literal, TypeVar +from typing import Annotated, Literal, TypeAlias, TypeVar -from pydantic import BaseModel, BeforeValidator, ConfigDict, SkipValidation +from pydantic import BaseModel, BeforeValidator, ConfigDict +from pydantic import SkipValidation # nosemgrep: verifiers-no-skip-validation from verifiers.types import ( ClientConfig, @@ -14,6 +15,12 @@ CoercedRolloutOutput = Annotated[ RolloutOutput, BeforeValidator(lambda v: RolloutOutput(v)) ] +RunInput: TypeAlias = ( # nosemgrep: verifiers-no-skip-validation + SkipValidation[RolloutInput] +) +GroupInput: TypeAlias = ( # nosemgrep: verifiers-no-skip-validation + SkipValidation[list[RolloutInput]] +) class BaseRequest(BaseModel): @@ -41,9 +48,8 @@ class HealthResponse(BaseResponse): ... 
class RunRolloutRequest(BaseRequest): request_type: Literal["run_rollout"] = "run_rollout" - # skip validation because multi-modal content type + tool calls validate weirdly - # (https://github.com/PrimeIntellect-ai/prime-rl/pull/1249) - input: SkipValidation[RolloutInput] + # Pydantic rejects some provider message shapes at this protocol boundary. + input: RunInput client_config: ClientConfig model: str sampling_args: SamplingArgs @@ -58,9 +64,8 @@ class RunRolloutResponse(BaseResponse): class RunGroupRequest(BaseRequest): request_type: Literal["run_group"] = "run_group" - # skip validation because multi-modal content type + tool calls validate weirdly - # (https://github.com/PrimeIntellect-ai/prime-rl/pull/1249) - group_inputs: SkipValidation[list[RolloutInput]] + # Pydantic rejects some provider message shapes at this protocol boundary. + group_inputs: GroupInput client_config: ClientConfig model: str sampling_args: SamplingArgs diff --git a/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md b/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md index 92a04caf7..36d492c39 100644 --- a/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +++ b/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md @@ -1,73 +1,252 @@ -# v1 Environment Best Practices - -This is the working checklist for building environments with the v1 -Taskset/Harness pattern. - -## Do - -- Expose `load_environment(config: vf.EnvConfig) -> vf.Env` for v1 - environments. The loader receives a typed config object from the caller. -- Import the public API with `import verifiers as vf`. `verifiers.v1` remains - available for framework-internal tests and narrow module-level checks, but - user environment code should use the top-level namespace. -- Use typed Pydantic config objects in Python code. Raw mappings are for TOML, - CLI, and other external boundaries. -- Keep `config` parameters to one concrete Pydantic config type or `None`. - Do not advertise unions of mappings, base configs, and specific configs. 
-- Treat `Mapping[str, object]` as an explicit boundary type. Accept it only for - intentionally dynamic payloads such as task rows, protocol messages, - sandbox/program specs, or Pydantic config fields that store arbitrary user - objects. Prefer a named alias such as `ConfigMap`, `TaskRow`, or - `Objects` over spelling the broad type in user-facing - signatures. -- Do not use raw `Any` in v1 environment code. If a value is intentionally - arbitrary, give that boundary a named type in `verifiers.v1.types`. -- Put task/data/scoring settings on a `TasksetConfig` owned by the taskset. -- Put rollout program/runtime settings on a `HarnessConfig` owned by the - harness. For example, `vf.RLM` takes `vf.RLMConfig`. -- Keep environment files as wiring: taskset construction, harness construction, - and small policy choices that compose the two. -- Prefer v1-native framework objects over stdlib-shaped user code. User-facing - environment code should read as Verifiers code first. -- Use `Taskset(objects=..., bindings=...)` for shared extractors, judges, - clients, and other dependencies that signal functions need. -- Compose related categories inside one taskset only when they share the same - harness lifecycle and scoring contract. -- Expose explicit typed loaders for separate v1 envs when categories need - different tasksets, harnesses, or lifecycle behavior. -- Keep dual v0/v1 loaders explicit only while migration is intentionally - dual-stack. - -## Don't - -- Do not mirror every taskset or harness config field as `load_environment` - kwargs. -- Do not put harness settings on taskset configs or taskset settings on harness - configs. -- Do not add environment-level config subclasses to carry fields already owned - by a taskset or harness config. -- Do not wrap v1 `Env` objects in the v0 `EnvGroup`. -- Do not add thin intermediate harness types that only restate `Harness`. 
- Reusable command agents should be direct `Harness` subclasses with their own - typed config only when they have real behavior to own. -- Do not add heterogeneous `TasksetGroup` routing as a substitute for a real - v1 suite abstraction. -- Do not overfit v1 APIs to names or layering inherited from - `research-environments`; keep the logical v1 ownership boundaries correct. -- Do not use vague config names that only repeat the component name. Name the - actual thing being configured: `rlm_repo_ref`, not `rlm_ref`; `env_vars`, not - `rlm_env`. -- Do not write global helper functions in environment files. Rare exceptions - are process-level handles, such as a lock, where a module-level object is the - cleanest way to assert process-wide control. -- Do not make users manipulate paths, package resources, or other stdlib details - when the framework can express the intent directly. - -## Enforcement - -- Put deterministic, repo-specific style rules in Semgrep rather than custom - AST scripts. Ruff owns generic lint and format, ty owns type correctness, and - Semgrep owns Verifiers-specific policy checks. -- Keep Semgrep rules principle-based. They should encode stable contracts such - as narrow config parameters and no broad user-facing `Any`, not one-off bans - for removed historical names. +# v1 Environment Contract + +This file is the authoring contract for environment builders using the v1 +Taskset/Harness pattern. Use it as the first source of truth when adding, +migrating, or reviewing v1 environments. + +The contract is intentionally strict. When a rule feels stricter than necessary, +prefer the strict interpretation and relax it only after a concrete environment +proves the looser surface is needed. + +## Contract Surface + +1. Environment modules MUST expose `load_environment(config: EnvConfigType)`. + `EnvConfigType` is `vf.EnvConfig` for base taskset/harness configs, or an + environment-local `vf.EnvConfig` subclass that binds concrete child config + types. 
+2. Environment modules SHOULD expose `load_taskset(config: TasksetConfigType)` + whenever the taskset has any reusable construction logic. +3. Environment modules SHOULD expose `load_harness(config: HarnessConfigType)` + when the harness has reusable construction logic or a custom config type. + Environments using the default harness MAY construct `vf.Harness` directly in + `load_environment`. +4. Public `load_taskset` and `load_harness` config parameters MUST be required + and typed as one concrete Pydantic config type. Defaults come from + `EnvConfig`; do not advertise unions of mappings, base configs, and specific + configs. +5. `load_environment` MUST NOT mirror taskset or harness fields as keyword + arguments. The root env loader receives the typed envelope; the child configs + own all environment-specific settings. + +## Why The Types Matter + +Verifiers uses loader annotations to decide how config should be parsed before +your loader runs. The type annotation is not cosmetic. + +1. If `load_environment` is annotated with `config: vf.EnvConfig`, TOML + `[env.taskset]` validates against `vf.TasksetConfig` and `[env.harness]` + validates against `vf.HarnessConfig`. +2. If your environment has custom taskset fields, `load_environment` MUST use an + `EnvConfig` subclass whose `taskset` annotation is your concrete + `TasksetConfig` subclass. Otherwise those TOML fields are rejected before + your loader can convert or forward them. +3. If your environment has custom harness fields, the same rule applies to the + `harness` annotation. +4. The config object that reaches `load_environment` is already validated and + typed. Do not reconstruct child config objects just to recover their type. +5. `load_taskset(config: MyTasksetConfig)` and + `load_harness(config: MyHarnessConfig)` make the reusable construction + contract explicit. They also give tests and examples one stable place to + exercise child config behavior. +6. 
Root env kwargs behave differently from TOML child sections. TOML + `[env.taskset]` and `[env.harness]` route into the env config envelope; CLI + `-a` passes loader kwargs. This is why CLI child config overrides must be nested + under `{"config": ...}`. + +## Config Envelope + +`vf.EnvConfig` is a typed envelope with exactly two child sections: + +```python +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: MyHarnessConfig +``` + +1. `EnvConfig.taskset` MUST be typed as `vf.TasksetConfig` or a subclass. +2. `EnvConfig.harness` MUST be typed as `vf.HarnessConfig` or a subclass. +3. `EnvConfig` subclasses MUST NOT define additional root fields. If a field is + specific to the environment, it belongs on the taskset or harness config that + owns the behavior. +4. `taskset` and `harness` MUST NOT be `None`. Omit the section to use the + default config. +5. Annotation-only `vf.Config` fields default to their config class. Do not add + `Field(default_factory=...)` for nested `vf.Config` fields unless the default + must be something other than the annotated class. + +## Ownership Rules + +Taskset configs own task behavior: + +- task sources, eval sources, splits, IDs, cache locations, and dataset sizing +- task prompts, user behavior, tools, tool bindings, and task-local objects +- task-specific scoring, metrics, rewards, advantages, and policy weights +- task-specific difficulty, category, filtering, and sampling controls + +Harness configs own rollout and execution behavior: + +- framework programs, command programs, sandboxes, and lifecycle wiring +- model/client/sampling defaults for standalone harnesses +- harness prompts, harness toolsets, harness bindings, and harness-local objects +- rollout limits, trajectory handling, and execution-specific scoring hooks + +Do not put harness settings on taskset configs or taskset settings on harness +configs.
Do not add an environment-level config class just to carry fields that +already belong to a taskset or harness. + +## Canonical Patterns + +### Default Harness + +Use this when the taskset owns the environment and the base harness is enough. + +```python +import verifiers as vf + + +class MyTasksetConfig(vf.TasksetConfig): + split: str = "train" + + +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: vf.HarnessConfig + + +def load_taskset(config: MyTasksetConfig) -> vf.Taskset: + return vf.Taskset(source=..., config=config) + + +def load_environment(config: MyEnvConfig) -> vf.Env: + return vf.Env( + taskset=load_taskset(config=config.taskset), + harness=vf.Harness(config=config.harness), + ) +``` + +### Custom Harness + +Use this when the harness owns a reusable execution mechanism or config surface. + +```python +import verifiers as vf + + +class MyTasksetConfig(vf.TasksetConfig): + split: str = "train" + + +class MyHarnessConfig(vf.HarnessConfig): + timeout_seconds: int = 120 + + +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: MyHarnessConfig + + +def load_taskset(config: MyTasksetConfig) -> vf.Taskset: + return vf.Taskset(source=..., config=config) + + +def load_harness(config: MyHarnessConfig) -> vf.Harness: + return vf.Harness(program=..., config=config) + + +def load_environment(config: MyEnvConfig) -> vf.Env: + return vf.Env( + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), + ) +``` + +## External Configuration + +TOML config routes v1 environment config through child sections: + +```toml +[env.taskset] +split = "eval" + +[env.harness] +max_turns = 20 +``` + +CLI `-a` passes loader kwargs. For v1 child config overrides, nest under the +`config` loader argument: + +```bash +prime eval run my-env -a '{"config":{"taskset":{"split":"eval"},"harness":{"max_turns":20}}}' +``` + +Raw mappings are boundary values for TOML, CLI, JSON, and other external input +surfaces. 
Inside Python environment code, use typed config objects. + +## Type Boundaries + +1. Import the public API with `import verifiers as vf`. `verifiers.v1` remains + available for framework-internal tests and narrow module-level checks, but + user environment code should use the top-level namespace. +2. Use Pydantic config models for structured configuration. +3. Treat `Mapping[str, object]` as an explicit boundary type. Accept it only for + intentionally dynamic payloads such as task rows, protocol messages, + sandbox/program specs, or config fields that store arbitrary user objects. +4. Prefer a named alias such as `ConfigMap`, `TaskRow`, or `Objects` over + spelling a broad mapping type in a user-facing signature. +5. Do not use raw `Any` in v1 environment code. If a value is intentionally + arbitrary, give that boundary a named type in `verifiers.v1.types`. +6. Avoid `object`, broad unions, and untyped mappings unless arbitrary user data + is genuinely the contract. +7. Do not bypass Pydantic validation for v1 config fields. Treat + `SkipValidation` and equivalent escape hatches as outside the contract unless + a boundary explicitly requires unvalidated framework-owned values. + +## Environment Structure + +1. Keep environment files as wiring: taskset construction, harness construction, + and small policy choices that compose the two. +2. Prefer v1-native framework objects over stdlib-shaped user code. User-facing + environment code should read as Verifiers code first. +3. Use `Taskset(objects=..., bindings=...)` for shared extractors, judges, + clients, and other dependencies that signal functions need. +4. Compose related categories inside one taskset only when they share the same + harness lifecycle and scoring contract. +5. Expose explicit typed loaders for separate v1 envs when categories need + different tasksets, harnesses, or lifecycle behavior. +6. Keep dual v0/v1 loaders explicit only while migration is intentionally + dual-stack. +7. 
Do not wrap v1 `Env` objects in the v0 `EnvGroup`. +8. Do not add thin intermediate harness types that only restate `Harness`. + Reusable command agents should be direct `Harness` subclasses with their own + typed config only when they have real behavior to own. +9. Do not add heterogeneous `TasksetGroup` routing as a substitute for a real v1 + suite abstraction. +10. Do not write global helper functions in environment files. Rare exceptions + are process-level handles, such as a lock, where a module-level object is the + cleanest way to assert process-wide control. +11. Do not make users manipulate paths, package resources, or other stdlib + details when the framework can express the intent directly. + +## Naming + +1. Config names should describe the thing being configured, not the layer that + happens to carry it. +2. Do not use vague names that only repeat the component name. Prefer + `repo_ref`, `env_vars`, or `dataset_split` over names like `rlm_ref`, + `rlm_env`, or `config`. +3. Public fields should be stable enough to appear in TOML and examples. If a + field is still experimental or internal, keep it behind a less public + abstraction. + +## Contract Review + +Before adding or changing a v1 environment, verify that: + +1. `load_environment` has one public config argument. +2. The env config root has only `taskset` and `harness`. +3. Every environment-specific field belongs to either the taskset config or the + harness config. +4. `load_taskset` and `load_harness` use one concrete config type each. +5. Raw mappings appear only at external or intentionally dynamic boundaries. +6. Validation bypasses, broad unions, and root env kwargs are absent unless the + file has a clear boundary-level reason for them. 
diff --git a/verifiers/v1/README.md b/verifiers/v1/README.md index e0023b91a..b5fc96290 100644 --- a/verifiers/v1/README.md +++ b/verifiers/v1/README.md @@ -148,7 +148,7 @@ async def contains_answer(task, state) -> float: return float(task["answer"] in str(state.get("completion") or "")) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[contains_answer], config=config) @@ -205,8 +205,7 @@ class GSM8KTasksetConfig(vf.TasksetConfig): split: str = "train" -def load_taskset(config: vf.TasksetConfig | None = None): - config = GSM8KTasksetConfig(config) +def load_taskset(config: GSM8KTasksetConfig): dataset_name = config.dataset_name split = config.split @@ -1135,11 +1134,11 @@ recommended loader shape is: import verifiers as vf -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[exact], config=config) -def load_harness(config: vf.HarnessConfig | None = None): +def load_harness(config: vf.HarnessConfig): return vf.Harness(config=config) @@ -1157,8 +1156,8 @@ def load_environment(config: vf.EnvConfig): return vf.Env(taskset=load_taskset(config=config.taskset)) ``` -With that loader, eval TOML routes named environment args through `args` and v1 -config through the `taskset`/`harness` sections: +With that loader, eval TOML routes v1 config through the `taskset`/`harness` +sections: ```toml # configs/eval/my-v1-env.toml @@ -1180,47 +1179,32 @@ max_turns = 4 weight = 0.5 ``` -For concise named args, define one typed args object and pass it as `args`. -`EnvConfig.args` is intentionally user-defined; environment packages decide how -those args flow into taskset and harness construction. +For environment-specific settings, define leaf fields on the taskset or harness +config that owns them. An `EnvConfig` subclass only fixes the concrete taskset +and harness config types for the loader. 
```python -class MyEnvArgsConfig(vf.Config): +class MyTasksetConfig(vf.TasksetConfig): split: str = "train" - max_turns: int = 10 -class MyTasksetConfig(vf.TasksetConfig): - split: str = "train" +class MyEnvConfig(vf.EnvConfig): + taskset: MyTasksetConfig + harness: vf.HarnessConfig -def load_taskset(config: vf.TasksetConfig | None = None): - config = MyTasksetConfig(config) +def load_taskset(config: MyTasksetConfig): ... -def load_harness(config: vf.HarnessConfig | None = None): - config = vf.HarnessConfig(config) +def load_harness(config: vf.HarnessConfig): ... -def load_environment( - config: vf.EnvConfig, - split: str = "train", - max_turns: int = 10, -): - config = vf.EnvConfig( - config, - args=MyEnvArgsConfig(split=split, max_turns=max_turns), - ) - args = MyEnvArgsConfig(config.args) +def load_environment(config: MyEnvConfig): return vf.Env( - taskset=load_taskset( - config=MyTasksetConfig(config.taskset, split=args.split) - ), - harness=load_harness( - config=vf.HarnessConfig(config.harness, max_turns=args.max_turns) - ), + taskset=load_taskset(config=config.taskset), + harness=load_harness(config=config.harness), ) ``` @@ -1239,12 +1223,12 @@ max_tokens = 4096 [[env]] id = "primeintellect/my-v1-env" -[env.args] -split = "train" - [env.harness] max_turns = 8 +[env.taskset] +split = "train" + [env.taskset.scoring.exact_answer] weight = 1.0 ``` diff --git a/verifiers/v1/RE_MIGRATION.md b/verifiers/v1/RE_MIGRATION.md index 05dd80f1f..81cfdb527 100644 --- a/verifiers/v1/RE_MIGRATION.md +++ b/verifiers/v1/RE_MIGRATION.md @@ -52,7 +52,7 @@ Every migrated package should expose: import verifiers as vf -def load_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset: +def load_taskset(config: vf.TasksetConfig) -> vf.Taskset: return vf.Taskset( source=load_rows, system_prompt=SYSTEM_PROMPT, @@ -63,7 +63,7 @@ def load_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset: ) -def load_harness(config: vf.HarnessConfig | None = None) -> vf.Harness: +def 
load_harness(config: vf.HarnessConfig) -> vf.Harness: return vf.Harness(config=config) @@ -160,7 +160,7 @@ async def exact(task, state) -> float: return float(str(task["answer"]).strip() in response) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset(source=source, rewards=[exact], config=config) @@ -244,7 +244,7 @@ def load_toolset(config=None): ) -def load_taskset(config: vf.TasksetConfig | None = None): +def load_taskset(config: vf.TasksetConfig): return vf.Taskset( source=source, toolsets=[load_toolset()], diff --git a/verifiers/v1/config.py b/verifiers/v1/config.py index 0c2b7276b..d631e1f39 100644 --- a/verifiers/v1/config.py +++ b/verifiers/v1/config.py @@ -7,10 +7,11 @@ BaseModel, ConfigDict, Field, - SkipValidation, + ValidationInfo, field_validator, model_validator, ) +from pydantic_core import PydanticUndefined from typing_extensions import Self from .types import ( @@ -67,6 +68,23 @@ class Config(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid") supports_config_ref: ClassVar[bool] = False + @classmethod + def __pydantic_init_subclass__(cls, **kwargs: object) -> None: + super().__pydantic_init_subclass__(**kwargs) + changed = False + for field in cls.model_fields.values(): + annotation = field.annotation + if ( + field.is_required() + and isinstance(annotation, type) + and issubclass(annotation, Config) + ): + field.default = PydanticUndefined + field.default_factory = annotation + changed = True + if changed: + cls.model_rebuild(force=True) + def __init__(self, config: ConfigSource | None = None, /, **data: object): super().__init__(**type(self)._merge_config_data(config, data)) @@ -322,47 +340,68 @@ def validate_bindings(cls, value: object) -> Bindings: class EnvConfig(Config): - args: SkipValidation[BaseModel | ConfigMap] | None = None - taskset: SkipValidation[BaseModel | ConfigMap] | None = None - harness: SkipValidation[BaseModel | ConfigMap] | 
None = None + taskset: TasksetConfig = Field(default_factory=TasksetConfig) + harness: HarnessConfig = Field(default_factory=HarnessConfig) - @field_validator("args") @classmethod - def validate_args(cls, value: object) -> object: - if value is not None: - try: - config_data(value) - except TypeError as exc: - raise ValueError(str(exc)) from exc - return value - - @field_validator("taskset", "harness") + def __pydantic_init_subclass__(cls, **kwargs: object) -> None: + super().__pydantic_init_subclass__(**kwargs) + extra_fields = set(cls.model_fields) - set(EnvConfig.model_fields) + if extra_fields: + raise TypeError( + f"{cls.__name__} defines unsupported root env config fields: " + f"{', '.join(sorted(extra_fields))}. Put env-specific settings on " + "a TasksetConfig or HarnessConfig instead." + ) + for field_name, expected_type in ( + ("taskset", TasksetConfig), + ("harness", HarnessConfig), + ): + annotation = cls.model_fields[field_name].annotation + if not ( + isinstance(annotation, type) and issubclass(annotation, expected_type) + ): + raise TypeError( + f"{cls.__name__}.{field_name} must be typed as a " + f"{expected_type.__name__} subclass." + ) + + @field_validator("taskset", "harness", mode="before") @classmethod - def validate_child_config(cls, value: object) -> object: - if value is not None: - try: - config_data(value) - except TypeError as exc: - raise ValueError(str(exc)) from exc + def validate_child_config(cls, value: object, info: ValidationInfo) -> object: + if value is None: + raise ValueError( + f"EnvConfig.{info.field_name} cannot be None. " + "Omit the section to use the default config." 
+ ) + try: + config_data(value) + except TypeError as exc: + raise ValueError(str(exc)) from exc return value @classmethod def _merge_config_data( cls, config: ConfigSource | None, data: ConfigData ) -> ConfigData: - data = omit_none(data) + data = dict(data) if config is None: return data base = config_data(config, cls) - for section in ("args", "taskset", "harness"): + for section in ("taskset", "harness"): + default_provided = section in data + override_provided = section in base default = data.get(section) override = base.pop(section, None) - if default is None: - if override is not None: - data[section] = override + if default_provided: + if override_provided: + if default is None or override is None: + data[section] = None + else: + data[section] = merge_child_config(default, override) continue - if override is not None: - data[section] = merge_child_config(default, override) + if override_provided: + data[section] = override base.update(data) return base diff --git a/verifiers/v1/utils/config_utils.py b/verifiers/v1/utils/config_utils.py index 0cf3b2a25..cbd895f95 100644 --- a/verifiers/v1/utils/config_utils.py +++ b/verifiers/v1/utils/config_utils.py @@ -24,7 +24,7 @@ def config_data(value: object, target: type[BaseModel] | None = None) -> ConfigD if value is None: data: ConfigData = {} elif isinstance(value, BaseModel): - data = value.model_dump(exclude_none=True, exclude_unset=True) + data = model_config_data(value) if target is not None: data = { key: item for key, item in data.items() if key in target.model_fields @@ -36,6 +36,29 @@ def config_data(value: object, target: type[BaseModel] | None = None) -> ConfigD return data +def model_config_data(value: BaseModel) -> ConfigData: + data: ConfigData = {} + for key in value.model_fields_set: + item = getattr(value, key) + if item is not None: + data[key] = config_dump_value(item) + return data + + +def config_dump_value(value: object) -> object: + if isinstance(value, BaseModel): + return 
model_config_data(value) + if isinstance(value, Mapping): + return { + key: config_dump_value(item) + for key, item in string_mapping(cast(ConfigInputMap, value)).items() + if item is not None + } + if isinstance(value, list | tuple): + return [config_dump_value(item) for item in value] + return value + + def omit_none(data: ConfigMap) -> ConfigData: return {key: value for key, value in data.items() if value is not None}