161 changes: 161 additions & 0 deletions tests/test_cli_agent_env.py
@@ -3,12 +3,16 @@
import asyncio
import tempfile
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from datasets import Dataset

import verifiers as vf
from verifiers.envs.experimental.cli_agent_env import CliAgentMonitorRubric
from verifiers.types import RolloutTiming
from verifiers.utils.save_utils import state_to_output
from verifiers.utils.interception_utils import serialize_intercept_response


@@ -224,6 +228,163 @@ async def test_non_streaming_intercept_tools_use_oai_schema(
assert kwargs["tools"][0].name == "echo"


@pytest.mark.asyncio
async def test_cli_agent_final_log_formats_missing_first_model_call():
timing = RolloutTiming()
timing.generation.start = 100.0
timing.generation.end = 115.0
timing.setup.start = 100.0
timing.setup.end = 105.0
timing.scoring.start = 115.0
timing.scoring.end = 118.0
state = {
"rollout_id": "rollout_test",
"example_id": 2332,
"info": {"instance_id": "brazilian-utils__brutils-python-126"},
"sandbox_id": "sbx_test",
"trajectory": [],
"stop_condition": "agent_completed",
"agent_exit_code": 0,
"agent_start_time": 105.0,
"agent_end_time": 112.0,
"timing": timing,
}

rubric = CliAgentMonitorRubric()
rubric.logger = MagicMock()

await rubric.cleanup(state)

message = rubric.logger.info.call_args.args[0]
assert "setup_s=5.000" in message
assert "first_call_latency_s=n/a" in message
assert "agent_s=7.000" in message
assert "scoring_s=3.000" in message
assert "duration_s=18.000" in message

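# A minimal sketch of how CliAgentMonitorRubric.cleanup could derive the
# figures asserted above (an assumption, not the verified implementation):
# timings come from the RolloutTiming spans plus the agent start/end
# timestamps stashed in state, and a missing first model call renders as
# "n/a" rather than a bogus number.
def _fmt_seconds(value):
    return "n/a" if value is None else f"{value:.3f}"


def _final_log_line_sketch(state):
    timing = state["timing"]
    first_call_latency = None  # empty trajectory: no first model call to measure
    return (
        f"setup_s={_fmt_seconds(timing.setup.end - timing.setup.start)} "
        f"first_call_latency_s={_fmt_seconds(first_call_latency)} "
        f"agent_s={_fmt_seconds(state['agent_end_time'] - state['agent_start_time'])} "
        f"scoring_s={_fmt_seconds(timing.scoring.end - timing.scoring.start)} "
        f"duration_s={_fmt_seconds(timing.scoring.end - timing.generation.start)}"
    )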

@pytest.mark.asyncio
async def test_cli_agent_failure_metrics_use_structured_failure():
rubric = CliAgentMonitorRubric()

assert (
await rubric.agent_error(
{
"failure": {
"reason": "agent_poll_failed",
"origin": "agent",
"error_type": "AgentPollError",
"root_error_type": "AgentPollError",
"message": "poll failed",
"logs": {},
}
}
)
== 1.0
)
assert (
await rubric.agent_error(
{
"failure": {
"reason": "sandbox_timeout",
"origin": "sandbox",
"error_type": "SandboxError",
"root_error_type": "SandboxError",
"message": "timeout",
"logs": {},
}
}
)
== 0.0
)
assert await rubric.agent_nonzero_exit({"agent_exit_code": 7}) == 1.0
assert await rubric.agent_nonzero_exit({"agent_exit_code": 0}) == 0.0
assert (
await rubric.agent_poll_failed(
{
"failure": {
"reason": "agent_poll_failed",
"origin": "agent",
"error_type": "AgentPollError",
"root_error_type": "AgentPollError",
"message": "poll failed",
"logs": {},
}
}
)
== 1.0
)

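# Hedged sketches of the three metric helpers exercised above (the real
# CliAgentMonitorRubric bodies may differ): each one reads the structured
# `failure` record written into state instead of string-matching errors.
async def agent_error_sketch(state):
    failure = state.get("failure") or {}
    return 1.0 if failure.get("origin") == "agent" else 0.0


async def agent_nonzero_exit_sketch(state):
    return 1.0 if state.get("agent_exit_code", 0) != 0 else 0.0


async def agent_poll_failed_sketch(state):
    failure = state.get("failure") or {}
    return 1.0 if failure.get("reason") == "agent_poll_failed" else 0.0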

def test_state_to_output_serializes_failure_and_preserves_error_fields():
state = vf.State(
input={"prompt": [], "example_id": 1, "task": "default", "info": {}}
)
state.update(
{
"example_id": 1,
"task": "default",
"prompt": [],
"completion": [],
"reward": 0.0,
"timing": RolloutTiming(),
"is_completed": True,
"is_truncated": False,
"stop_condition": "has_error",
"metrics": {},
"tool_defs": [],
"trajectory": [],
"error": vf.ModelError("model failed"),
}
)

output = state_to_output(state, state_columns=[])

assert output["error"]["error"] == "ModelError"
assert output["error_chain"] == "ModelError('model failed')"
assert output["long_error_chain"] == "ModelError"
assert output["failure"]["reason"] == "model_error"
assert output["failure"]["origin"] == "model"
assert output["failure"]["error_type"] == "ModelError"

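# Assumed shape of the failure derivation inside state_to_output (a sketch
# consistent with the assertions above, not the verified source): the typed
# error maps onto a structured failure record, while the legacy error and
# error_chain fields are preserved alongside it for compatibility.
import re


def derive_failure_sketch(error: Exception) -> dict:
    name = type(error).__name__  # e.g. "ModelError"
    reason = re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()  # "model_error"
    origin = reason.split("_")[0]  # "model"
    return {
        "reason": reason,
        "origin": origin,
        "error_type": name,
        "root_error_type": name,
        "message": str(error),
    }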

@pytest.mark.asyncio
async def test_failure_log_collection_returns_bounded_tails(sample_dataset):
env = vf.CliAgentEnv(
run_command="python agent.py",
dataset=sample_dataset,
rubric=vf.Rubric(),
)
env.sandbox_client.get_background_job = AsyncMock(
return_value=SimpleNamespace(stdout="a" * 13000, stderr="err")
)
state = {"sandbox_id": "sbx", "background_job": object()}

logs = await env._collect_background_job_log_tails(state)

assert logs["agent_stdout"].endswith("a" * env.FAILURE_LOG_TAIL_CHARS)
assert logs["agent_stdout"].startswith("...<truncated ")
assert logs["agent_stderr"] == "err"

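# A plausible tail-bounding helper matching the assertions above (assumed,
# not the library's actual code): keep only the last FAILURE_LOG_TAIL_CHARS
# characters and prefix a marker recording how much was dropped, so huge
# agent logs cannot bloat the serialized failure record.
def bounded_tail_sketch(text: str, limit: int) -> str:
    if len(text) <= limit:
        return text
    dropped = len(text) - limit
    return f"...<truncated {dropped} chars> " + text[-limit:]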

@pytest.mark.asyncio
async def test_failure_log_collection_suppresses_read_errors(sample_dataset):
env = vf.CliAgentEnv(
run_command="python agent.py",
dataset=sample_dataset,
rubric=vf.Rubric(),
)
env.logger = MagicMock()
env.get_failure_log_paths = lambda state: {"missing": "/tmp/missing.log"}
env.sandbox_client.execute_command = AsyncMock(side_effect=RuntimeError("boom"))
state = {"sandbox_id": "sbx"}

logs = await env.collect_failure_diagnostics(state)

assert logs == {}
env.logger.warning.assert_called()

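# Sketch of the suppress-and-warn pattern this test pins down (method and
# attribute names are taken from the test; the command shape is an
# assumption): diagnostics are best-effort, so a failed log read is logged
# and skipped rather than allowed to mask the original rollout failure.
async def collect_failure_diagnostics_sketch(env, state):
    logs = {}
    for name, path in env.get_failure_log_paths(state).items():
        try:
            result = await env.sandbox_client.execute_command(
                state["sandbox_id"], f"cat {path}"
            )
            logs[name] = result.stdout
        except Exception as exc:
            env.logger.warning("failed to read %s: %s", path, exc)
    return logs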

@pytest.mark.asyncio
async def test_cli_agent_env_delivers_intercepted_tool_call_response(
sample_dataset, mock_client
56 changes: 56 additions & 0 deletions tests/test_swe_rollout_observability.py
@@ -0,0 +1,56 @@
from types import SimpleNamespace

from verifiers.envs.experimental.composable.composable_env import ComposableEnv
from verifiers.envs.experimental.composable.harnesses.opencode import (
build_install_script,
)
from verifiers.envs.experimental.composable.tasksets.swe.swe_rebench_v2 import (
_patch_failure_message,
)


def test_patch_failure_message_includes_empty_stderr_context():
message = _patch_failure_message(
"test_patch",
"diff --git a/test.py b/test.py\n",
[
(
"git_apply",
"git apply /tmp/test_patch.patch",
"/repo",
SimpleNamespace(exit_code=1, stdout="", stderr=""),
)
],
)

assert "patch_sha256=" in message
assert "patch_size=" in message
assert "git_apply_command='git apply /tmp/test_patch.patch'" in message
assert "git_apply_working_dir='/repo'" in message
assert "git_apply_stdout='<empty>'" in message
assert "git_apply_stderr='<empty>'" in message

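# Assumed formatting for _patch_failure_message (a sketch consistent with
# the assertions above, not the taskset's actual code): empty streams are
# rendered as '<empty>' so the message never silently drops context, and
# the patch hash and size make it easy to tie a failure to a specific patch.
import hashlib


def patch_failure_message_sketch(name, patch, attempts):
    parts = [
        f"failed to apply {name}",
        f"patch_sha256={hashlib.sha256(patch.encode()).hexdigest()}",
        f"patch_size={len(patch)}",
    ]
    for label, command, cwd, result in attempts:
        parts.append(f"{label}_command='{command}'")
        parts.append(f"{label}_working_dir='{cwd}'")
        parts.append(f"{label}_stdout='{result.stdout or '<empty>'}'")
        parts.append(f"{label}_stderr='{result.stderr or '<empty>'}'")
    return " ".join(parts)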

def test_opencode_install_script_wraps_setup_steps():
script = build_install_script()

assert ": > /tmp/install_progress.log" in script
assert "tee -a /tmp/install_progress.log" in script
assert 'echo "[setup] start $name"' in script
assert 'echo "[setup] end $name exit=$exit_code elapsed_s=$elapsed_s"' in script
assert 'run_setup_step "apt_dependencies"' in script
assert 'run_setup_step "ripgrep_install"' in script
assert 'run_setup_step "download_opencode"' in script
assert 'run_setup_step "verify_opencode_sha256"' in script
assert 'run_setup_step "install_opencode"' in script

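# A sketch of the run_setup_step helper the install script is assumed to
# define (reconstructed from the assertions above, not copied from the
# harness). It assumes each setup step is a same-named shell function; the
# wrapper logs start/end markers with the exit code and elapsed seconds.
RUN_SETUP_STEP_SKETCH = """
run_setup_step() {
  name="$1"
  echo "[setup] start $name" | tee -a /tmp/install_progress.log
  start_ts=$(date +%s)
  "$name"
  exit_code=$?
  elapsed_s=$(( $(date +%s) - start_ts ))
  echo "[setup] end $name exit=$exit_code elapsed_s=$elapsed_s" | tee -a /tmp/install_progress.log
  return "$exit_code"
}
"""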

def test_observed_setup_wrapper_writes_trace_and_preserves_exit():
wrapped = ComposableEnv._wrap_observed_setup_command(
"echo hi && false", "agent_install"
)

assert "/tmp/vf_observed_command.log" in wrapped
assert "event=observed_command_started" in wrapped
assert "event=setup_failed" in wrapped
assert 'exit "${PIPESTATUS[0]}"' in wrapped
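
# Sketch of ComposableEnv._wrap_observed_setup_command (an assumption
# reconstructed from the assertions above, not the verified source). The
# EXIT trap records setup_failed on any nonzero status, and exiting with
# ${PIPESTATUS[0]} immediately after the tee pipeline preserves the wrapped
# command's own exit code rather than tee's.
def wrap_observed_setup_command_sketch(command: str, step: str) -> str:
    log = "/tmp/vf_observed_command.log"
    return "\n".join(
        [
            f'echo "event=observed_command_started step={step}" >> {log}',
            f"trap '[ \"$?\" -ne 0 ] && echo \"event=setup_failed step={step}\" >> {log}' EXIT",
            f"{{ {command}; }} 2>&1 | tee -a {log}",
            'exit "${PIPESTATUS[0]}"',
        ]
    )
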
1 change: 1 addition & 0 deletions verifiers/envs/environment.py
@@ -608,6 +608,7 @@ async def init_state(
state["reward"] = None
state["metrics"] = None
state["error"] = None
state["failure"] = None
state["final_env_response"] = None
state["timing"] = RolloutTiming()
return state