Map the `reasoning_effort` sampling arg onto provider-specific request fields.

* AnthropicMessagesClient: for model ids starting with `anthropic/` or `claude-`,
  `reasoning_effort` is moved into `output_config["effort"]` (existing
  `output_config` keys are kept); ids containing `4-6` / `4-7` also get
  `thinking={"type": "adaptive"}` unless `thinking` was supplied.
* OpenAIChatCompletionsClient: for the same model ids, when the base URL contains
  `openrouter.ai` or `pinference.ai`, `reasoning_effort` is moved into
  `extra_body["verbosity"]` and `extra_body["reasoning"]["enabled"]` defaults to True.
* In every other case `reasoning_effort` is passed through unchanged.

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index d51c38262..38e67aaa9 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -1,5 +1,6 @@
 import pytest
 from types import SimpleNamespace
+from typing import Any
 
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
 from verifiers.types import (
@@ -18,6 +19,26 @@
 from verifiers.utils.response_utils import parse_response_message
 
 
+class _RecordingCreate:
+    def __init__(self, response: Any) -> None:
+        self.response = response
+        self.calls: list[dict[str, Any]] = []
+
+    async def create(self, **kwargs: Any) -> Any:
+        self.calls.append(kwargs)
+        return self.response
+
+
+def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]:
+    recorder = _RecordingCreate(response)
+    return SimpleNamespace(chat=SimpleNamespace(completions=recorder)), recorder
+
+
+def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]:
+    recorder = _RecordingCreate(response)
+    return SimpleNamespace(messages=recorder), recorder
+
+
 @pytest.mark.asyncio
 async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     client = OpenAIChatCompletionsClient(object())
@@ -52,6 +73,59 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     ]
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("model", "effort"),
+    [
+        ("anthropic/claude-opus-4.7", "xhigh"),
+        ("anthropic/claude-sonnet-4.6", "max"),
+    ],
+)
+async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity(
+    model: str, effort: str
+):
+    recording_client, recorder = _recording_openai(SimpleNamespace())
+    client = OpenAIChatCompletionsClient(recording_client)
+    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")
+
+    response = await client.get_native_response(
+        prompt=[],
+        model=model,
+        sampling_args={
+            "n": 1,
+            "reasoning_effort": effort,
+            "extra_body": {"reasoning": {"enabled": True}},
+        },
+    )
+
+    assert response is recorder.response
+    call = recorder.calls[0]
+    assert "reasoning_effort" not in call
+    assert call["extra_body"] == {
+        "reasoning": {"enabled": True},
+        "verbosity": effort,
+    }
+
+
+@pytest.mark.asyncio
+async def test_openrouter_anthropic_reasoning_effort_enables_reasoning():
+    recording_client, recorder = _recording_openai(SimpleNamespace())
+    client = OpenAIChatCompletionsClient(recording_client)
+    client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1")
+
+    await client.get_native_response(
+        prompt=[],
+        model="anthropic/claude-opus-4.7",
+        sampling_args={"reasoning_effort": "high"},
+    )
+
+    call = recorder.calls[0]
+    assert call["extra_body"] == {
+        "reasoning": {"enabled": True},
+        "verbosity": "high",
+    }
+
+
 @pytest.mark.asyncio
 async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     pytest.importorskip("anthropic")
@@ -151,6 +225,58 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag
     ]
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("model", "effort"),
+    [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")],
+)
+async def test_anthropic_reasoning_effort_maps_to_output_config(
+    model: str, effort: str
+):
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    recording_client, recorder = _recording_anthropic(SimpleNamespace())
+    client = AnthropicMessagesClient(recording_client)
+
+    response = await client.get_native_response(
+        prompt=[],
+        model=model,
+        sampling_args={"max_tokens": 128, "reasoning_effort": effort},
+    )
+
+    assert response is recorder.response
+    call = recorder.calls[0]
+    assert "reasoning_effort" not in call
+    assert call["output_config"] == {"effort": effort}
+    assert call["thinking"] == {"type": "adaptive"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_reasoning_effort_preserves_existing_output_config():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    recording_client, recorder = _recording_anthropic(SimpleNamespace())
+    client = AnthropicMessagesClient(recording_client)
+
+    await client.get_native_response(
+        prompt=[],
+        model="claude-opus-4-7",
+        sampling_args={
+            "max_tokens": 128,
+            "reasoning_effort": "high",
+            "output_config": {"format": {"type": "text"}},
+        },
+    )
+
+    call = recorder.calls[0]
+    assert call["output_config"] == {
+        "format": {"type": "text"},
+        "effort": "high",
+    }
+
+
 @pytest.mark.asyncio
 async def test_anthropic_from_native_response_extracts_usage():
     anthropic = pytest.importorskip("anthropic")
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 9e80b63b7..0f440dc0f 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -342,6 +342,20 @@ async def get_native_response(
     ) -> AnthropicMessage:
         def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
             sampling_args = dict(sampling_args)
+            reasoning_effort = sampling_args.pop("reasoning_effort", None)
+            model_id = model.lower().replace(".", "-").replace("_", "-")
+            if reasoning_effort is not None and (
+                model_id.startswith("anthropic/") or model_id.startswith("claude-")
+            ):
+                output_config = dict(sampling_args.get("output_config") or {})
+                output_config["effort"] = reasoning_effort
+                sampling_args["output_config"] = output_config
+                if "thinking" not in sampling_args and (
+                    "4-7" in model_id or "4-6" in model_id
+                ):
+                    sampling_args["thinking"] = {"type": "adaptive"}
+            elif reasoning_effort is not None:
+                sampling_args["reasoning_effort"] = reasoning_effort
             max_tokens = sampling_args.pop("max_tokens", None)
             sampling_args.pop("n", None)
             sampling_args.pop("stop", None)
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index c755d8dd4..c65cc90ad 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -252,6 +252,29 @@ async def get_native_response(
     ) -> OpenAIChatResponse:
         def normalize_sampling_args(sampling_args: SamplingArgs):
             sampling_args = dict(sampling_args)
+            api_base_url = None
+            if hasattr(self.client, "base_url"):
+                api_base_url = str(self.client.base_url)
+            elif self._config is not None:
+                api_base_url = self._config.api_base_url
+            reasoning_effort = sampling_args.pop("reasoning_effort", None)
+            model_id = model.lower().replace(".", "-").replace("_", "-")
+            api_base_url = (api_base_url or "").lower()
+            if (
+                reasoning_effort is not None
+                and (
+                    model_id.startswith("anthropic/") or model_id.startswith("claude-")
+                )
+                and ("openrouter.ai" in api_base_url or "pinference.ai" in api_base_url)
+            ):
+                extra_body = dict(sampling_args.get("extra_body") or {})
+                extra_body["verbosity"] = reasoning_effort
+                reasoning = dict(extra_body.get("reasoning") or {})
+                reasoning.setdefault("enabled", True)
+                extra_body["reasoning"] = reasoning
+                sampling_args["extra_body"] = extra_body
+            elif reasoning_effort is not None:
+                sampling_args["reasoning_effort"] = reasoning_effort
             if "max_tokens" in sampling_args:
                 sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
             return {k: v for k, v in sampling_args.items() if v is not None}