From 4241702260a5f8ec32200952c807892ae72c3798 Mon Sep 17 00:00:00 2001 From: Xeophon <46377542+xeophon@users.noreply.github.com> Date: Mon, 11 May 2026 12:47:50 +0200 Subject: [PATCH 1/3] Map Anthropic reasoning effort by provider Amp-Thread-ID: https://ampcode.com/threads/T-019e1682-ae38-76cc-bda3-d90d28b62198 Co-authored-by: Amp --- tests/test_client_multimodal_types.py | 126 ++++++++++++++++++ .../clients/anthropic_messages_client.py | 14 ++ .../clients/openai_chat_completions_client.py | 23 ++++ 3 files changed, 163 insertions(+) diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py index d51c38262..38e67aaa9 100644 --- a/tests/test_client_multimodal_types.py +++ b/tests/test_client_multimodal_types.py @@ -1,5 +1,6 @@ import pytest from types import SimpleNamespace +from typing import Any from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient from verifiers.types import ( @@ -18,6 +19,26 @@ from verifiers.utils.response_utils import parse_response_message +class _RecordingCreate: + def __init__(self, response: Any) -> None: + self.response = response + self.calls: list[dict[str, Any]] = [] + + async def create(self, **kwargs: Any) -> Any: + self.calls.append(kwargs) + return self.response + + +def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]: + recorder = _RecordingCreate(response) + return SimpleNamespace(chat=SimpleNamespace(completions=recorder)), recorder + + +def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]: + recorder = _RecordingCreate(response) + return SimpleNamespace(messages=recorder), recorder + + @pytest.mark.asyncio async def test_openai_to_native_prompt_with_typed_multimodal_content_parts(): client = OpenAIChatCompletionsClient(object()) @@ -52,6 +73,59 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts(): ] +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("model", "effort"), + [ + ("anthropic/claude-opus-4.7", "xhigh"), + ("anthropic/claude-sonnet-4.6", "max"), + ], +) +async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity( + model: str, effort: str +): + recording_client, recorder = _recording_openai(SimpleNamespace()) + client = OpenAIChatCompletionsClient(recording_client) + client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1") + + response = await client.get_native_response( + prompt=[], + model=model, + sampling_args={ + "n": 1, + "reasoning_effort": effort, + "extra_body": {"reasoning": {"enabled": True}}, + }, + ) + + assert response is recorder.response + call = recorder.calls[0] + assert "reasoning_effort" not in call + assert call["extra_body"] == { + "reasoning": {"enabled": True}, + "verbosity": effort, + } + + +@pytest.mark.asyncio +async def test_openrouter_anthropic_reasoning_effort_enables_reasoning(): + recording_client, recorder = _recording_openai(SimpleNamespace()) + client = OpenAIChatCompletionsClient(recording_client) + client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1") + + await client.get_native_response( + prompt=[], + model="anthropic/claude-opus-4.7", + sampling_args={"reasoning_effort": "high"}, + ) + + call = recorder.calls[0] + assert call["extra_body"] == { + "reasoning": {"enabled": True}, + "verbosity": "high", + } + + @pytest.mark.asyncio async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts(): pytest.importorskip("anthropic") @@ -151,6 +225,58 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag ] +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("model", "effort"), + [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")], +) +async def test_anthropic_reasoning_effort_maps_to_output_config( + model: str, effort: str +): + pytest.importorskip("anthropic") + from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient + + recording_client, recorder = _recording_anthropic(SimpleNamespace()) + client = AnthropicMessagesClient(recording_client) + + response = await client.get_native_response( + prompt=[], + model=model, + sampling_args={"max_tokens": 128, "reasoning_effort": effort}, + ) + + assert response is recorder.response + call = recorder.calls[0] + assert "reasoning_effort" not in call + assert call["output_config"] == {"effort": effort} + assert call["thinking"] == {"type": "adaptive"} + + +@pytest.mark.asyncio +async def test_anthropic_reasoning_effort_preserves_existing_output_config(): + pytest.importorskip("anthropic") + from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient + + recording_client, recorder = _recording_anthropic(SimpleNamespace()) + client = AnthropicMessagesClient(recording_client) + + await client.get_native_response( + prompt=[], + model="claude-opus-4-7", + sampling_args={ + "max_tokens": 128, + "reasoning_effort": "high", + "output_config": {"format": {"type": "text"}}, + }, + ) + + call = recorder.calls[0] + assert call["output_config"] == { + "format": {"type": "text"}, + "effort": "high", + } + + @pytest.mark.asyncio async def test_anthropic_from_native_response_extracts_usage(): anthropic = pytest.importorskip("anthropic") diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py index 9e80b63b7..0f440dc0f 100644 --- a/verifiers/clients/anthropic_messages_client.py +++ b/verifiers/clients/anthropic_messages_client.py @@ -342,6 +342,20 @@ async def get_native_response( ) -> AnthropicMessage: def normalize_sampling_args(sampling_args: SamplingArgs) -> dict: sampling_args = dict(sampling_args) + reasoning_effort = sampling_args.pop("reasoning_effort", None) + model_id = model.lower().replace(".", "-").replace("_", "-") + if reasoning_effort is not None and ( + model_id.startswith("anthropic/") or model_id.startswith("claude-") + ): + output_config = dict(sampling_args.get("output_config") or {}) + output_config["effort"] = reasoning_effort + sampling_args["output_config"] = output_config + if "thinking" not in sampling_args and ( + "4-7" in model_id or "4-6" in model_id + ): + sampling_args["thinking"] = {"type": "adaptive"} + elif reasoning_effort is not None: + sampling_args["reasoning_effort"] = reasoning_effort max_tokens = sampling_args.pop("max_tokens", None) sampling_args.pop("n", None) sampling_args.pop("stop", None) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index c755d8dd4..c65cc90ad 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -252,6 +252,29 @@ async def get_native_response( ) -> OpenAIChatResponse: def normalize_sampling_args(sampling_args: SamplingArgs): sampling_args = dict(sampling_args) + api_base_url = None + if hasattr(self.client, "base_url"): + api_base_url = str(self.client.base_url) + elif self._config is not None: + api_base_url = self._config.api_base_url + reasoning_effort = sampling_args.pop("reasoning_effort", None) + model_id = model.lower().replace(".", "-").replace("_", "-") + api_base_url = (api_base_url or "").lower() + if ( + reasoning_effort is not None + and ( + model_id.startswith("anthropic/") or model_id.startswith("claude-") + ) + and ("openrouter.ai" in api_base_url or "pinference.ai" in api_base_url) + ): + extra_body = dict(sampling_args.get("extra_body") or {}) + extra_body["verbosity"] = reasoning_effort + reasoning = dict(extra_body.get("reasoning") or {}) + reasoning.setdefault("enabled", True) + extra_body["reasoning"] = reasoning + sampling_args["extra_body"] = extra_body + elif reasoning_effort is not None: + sampling_args["reasoning_effort"] = reasoning_effort if "max_tokens" in sampling_args: sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") return {k: v for k, v in sampling_args.items() if v is not None} From e727960aec3fec8f04f5797a956ec867ece050b3 Mon Sep 17 00:00:00 2001 From: Xeophon <46377542+xeophon@users.noreply.github.com> Date: Tue, 12 May 2026 15:42:59 +0200 Subject: [PATCH 2/3] Simplify Anthropic reasoning effort mapping --- tests/test_client_multimodal_types.py | 39 +++++++++++++++++++ .../clients/anthropic_messages_client.py | 23 +++++++---- .../clients/openai_chat_completions_client.py | 14 ++++--- 3 files changed, 62 insertions(+), 14 deletions(-) diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py index 38e67aaa9..ee701ffcc 100644 --- a/tests/test_client_multimodal_types.py +++ b/tests/test_client_multimodal_types.py @@ -126,6 +126,26 @@ async def test_openrouter_anthropic_reasoning_effort_enables_reasoning(): } +@pytest.mark.asyncio +async def test_openrouter_anthropic_reasoning_effort_maps_opus_4_5(): + recording_client, recorder = _recording_openai(SimpleNamespace()) + client = OpenAIChatCompletionsClient(recording_client) + client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1") + + await client.get_native_response( + prompt=[], + model="anthropic/claude-opus-4.5", + sampling_args={"reasoning_effort": "medium"}, + ) + + call = recorder.calls[0] + assert "reasoning_effort" not in call + assert call["extra_body"] == { + "reasoning": {"enabled": True}, + "verbosity": "medium", + } + + @pytest.mark.asyncio async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts(): pytest.importorskip("anthropic") @@ -252,6 +272,25 @@ async def test_anthropic_reasoning_effort_maps_to_output_config( assert call["thinking"] == {"type": "adaptive"} +@pytest.mark.asyncio +async def test_anthropic_opus_4_5_uses_output_config_without_adaptive_thinking(): + pytest.importorskip("anthropic") + from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient + + recording_client, recorder = _recording_anthropic(SimpleNamespace()) + client = AnthropicMessagesClient(recording_client) + + await client.get_native_response( + prompt=[], + model="claude-opus-4-5", + sampling_args={"max_tokens": 4096, "reasoning_effort": "medium"}, + ) + + call = recorder.calls[0] + assert call["output_config"] == {"effort": "medium"} + assert "thinking" not in call + + @pytest.mark.asyncio async def test_anthropic_reasoning_effort_preserves_existing_output_config(): pytest.importorskip("anthropic") diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py index 0f440dc0f..31611d81d 100644 --- a/verifiers/clients/anthropic_messages_client.py +++ b/verifiers/clients/anthropic_messages_client.py @@ -50,6 +50,13 @@ from verifiers.utils.client_utils import setup_anthropic_client +ANTHROPIC_ADAPTIVE_THINKING_MODELS = { + "claude-opus-4-7", + "claude-opus-4-6", + "claude-sonnet-4-6", +} + + def _handle_anthropic_overlong_prompt(func): """Decorator to handle overlong prompt errors from the Anthropic API.""" @@ -343,19 +350,19 @@ async def get_native_response( def normalize_sampling_args(sampling_args: SamplingArgs) -> dict: sampling_args = dict(sampling_args) reasoning_effort = sampling_args.pop("reasoning_effort", None) - model_id = model.lower().replace(".", "-").replace("_", "-") - if reasoning_effort is not None and ( - model_id.startswith("anthropic/") or model_id.startswith("claude-") - ): + if reasoning_effort is not None: + model_id = ( + model.lower().split("/")[-1].replace(".", "-").replace("_", "-") + ) output_config = dict(sampling_args.get("output_config") or {}) output_config["effort"] = reasoning_effort sampling_args["output_config"] = output_config - if "thinking" not in sampling_args and ( - "4-7" in model_id or "4-6" in model_id + if "thinking" not in sampling_args and any( + model_id == adaptive_model + or model_id.startswith(f"{adaptive_model}-") + for adaptive_model in ANTHROPIC_ADAPTIVE_THINKING_MODELS ): sampling_args["thinking"] = {"type": "adaptive"} - elif reasoning_effort is not None: - sampling_args["reasoning_effort"] = reasoning_effort max_tokens = sampling_args.pop("max_tokens", None) sampling_args.pop("n", None) sampling_args.pop("stop", None) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index c65cc90ad..0e4ef5090 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -258,15 +258,17 @@ def normalize_sampling_args(sampling_args: SamplingArgs): elif self._config is not None: api_base_url = self._config.api_base_url reasoning_effort = sampling_args.pop("reasoning_effort", None) - model_id = model.lower().replace(".", "-").replace("_", "-") - api_base_url = (api_base_url or "").lower() + model_id = model.lower().split("/")[-1].replace(".", "-").replace("_", "-") + is_anthropic_route = ( + "openrouter.ai" in (api_base_url or "").lower() + or "pinference.ai" in (api_base_url or "").lower() + ) if ( reasoning_effort is not None - and ( - model_id.startswith("anthropic/") or model_id.startswith("claude-") - ) - and ("openrouter.ai" in api_base_url or "pinference.ai" in api_base_url) + and model_id.startswith("claude-") + and is_anthropic_route ): + # OpenRouter/Pinference route Anthropic reasoning_effort through extra_body. extra_body = dict(sampling_args.get("extra_body") or {}) extra_body["verbosity"] = reasoning_effort reasoning = dict(extra_body.get("reasoning") or {}) From a4c1fc71bbf365d5acb2eb79c8d2516aa7f06a89 Mon Sep 17 00:00:00 2001 From: Xeophon <46377542+xeophon@users.noreply.github.com> Date: Tue, 12 May 2026 15:44:53 +0200 Subject: [PATCH 3/3] Remove Anthropic reasoning effort tests --- tests/test_client_multimodal_types.py | 165 -------------------------- 1 file changed, 165 deletions(-) diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py index ee701ffcc..d51c38262 100644 --- a/tests/test_client_multimodal_types.py +++ b/tests/test_client_multimodal_types.py @@ -1,6 +1,5 @@ import pytest from types import SimpleNamespace -from typing import Any from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient from verifiers.types import ( @@ -19,26 +18,6 @@ from verifiers.utils.response_utils import parse_response_message -class _RecordingCreate: - def __init__(self, response: Any) -> None: - self.response = response - self.calls: list[dict[str, Any]] = [] - - async def create(self, **kwargs: Any) -> Any: - self.calls.append(kwargs) - return self.response - - -def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]: - recorder = _RecordingCreate(response) - return SimpleNamespace(chat=SimpleNamespace(completions=recorder)), recorder - - -def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]: - recorder = _RecordingCreate(response) - return SimpleNamespace(messages=recorder), recorder - - @pytest.mark.asyncio async def test_openai_to_native_prompt_with_typed_multimodal_content_parts(): client = OpenAIChatCompletionsClient(object()) @@ -73,79 +52,6 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts(): ] -@pytest.mark.asyncio -@pytest.mark.parametrize( - ("model", "effort"), - [ - ("anthropic/claude-opus-4.7", "xhigh"), - ("anthropic/claude-sonnet-4.6", "max"), - ], -) -async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity( - model: str, effort: str -): - recording_client, recorder = _recording_openai(SimpleNamespace()) - client = OpenAIChatCompletionsClient(recording_client) - client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1") - - response = await client.get_native_response( - prompt=[], - model=model, - sampling_args={ - "n": 1, - "reasoning_effort": effort, - "extra_body": {"reasoning": {"enabled": True}}, - }, - ) - - assert response is recorder.response - call = recorder.calls[0] - assert "reasoning_effort" not in call - assert call["extra_body"] == { - "reasoning": {"enabled": True}, - "verbosity": effort, - } - - -@pytest.mark.asyncio -async def test_openrouter_anthropic_reasoning_effort_enables_reasoning(): - recording_client, recorder = _recording_openai(SimpleNamespace()) - client = OpenAIChatCompletionsClient(recording_client) - client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1") - - await client.get_native_response( - prompt=[], - model="anthropic/claude-opus-4.7", - sampling_args={"reasoning_effort": "high"}, - ) - - call = recorder.calls[0] - assert call["extra_body"] == { - "reasoning": {"enabled": True}, - "verbosity": "high", - } - - -@pytest.mark.asyncio -async def test_openrouter_anthropic_reasoning_effort_maps_opus_4_5(): - recording_client, recorder = _recording_openai(SimpleNamespace()) - client = OpenAIChatCompletionsClient(recording_client) - client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1") - - await client.get_native_response( - prompt=[], - model="anthropic/claude-opus-4.5", - sampling_args={"reasoning_effort": "medium"}, - ) - - call = recorder.calls[0] - assert "reasoning_effort" not in call - assert call["extra_body"] == { - "reasoning": {"enabled": True}, - "verbosity": "medium", - } - - @pytest.mark.asyncio async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts(): pytest.importorskip("anthropic") @@ -245,77 +151,6 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag ] -@pytest.mark.asyncio -@pytest.mark.parametrize( - ("model", "effort"), - [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")], -) -async def test_anthropic_reasoning_effort_maps_to_output_config( - model: str, effort: str -): - pytest.importorskip("anthropic") - from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient - - recording_client, recorder = _recording_anthropic(SimpleNamespace()) - client = AnthropicMessagesClient(recording_client) - - response = await client.get_native_response( - prompt=[], - model=model, - sampling_args={"max_tokens": 128, "reasoning_effort": effort}, - ) - - assert response is recorder.response - call = recorder.calls[0] - assert "reasoning_effort" not in call - assert call["output_config"] == {"effort": effort} - assert call["thinking"] == {"type": "adaptive"} - - -@pytest.mark.asyncio -async def test_anthropic_opus_4_5_uses_output_config_without_adaptive_thinking(): - pytest.importorskip("anthropic") - from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient - - recording_client, recorder = _recording_anthropic(SimpleNamespace()) - client = AnthropicMessagesClient(recording_client) - - await client.get_native_response( - prompt=[], - model="claude-opus-4-5", - sampling_args={"max_tokens": 4096, "reasoning_effort": "medium"}, - ) - - call = recorder.calls[0] - assert call["output_config"] == {"effort": "medium"} - assert "thinking" not in call - - -@pytest.mark.asyncio -async def test_anthropic_reasoning_effort_preserves_existing_output_config(): - pytest.importorskip("anthropic") - from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient - - recording_client, recorder = _recording_anthropic(SimpleNamespace()) - client = AnthropicMessagesClient(recording_client) - - await client.get_native_response( - prompt=[], - model="claude-opus-4-7", - sampling_args={ - "max_tokens": 128, - "reasoning_effort": "high", - "output_config": {"format": {"type": "text"}}, - }, - ) - - call = recorder.calls[0] - assert call["output_config"] == { - "format": {"type": "text"}, - "effort": "high", - } - - @pytest.mark.asyncio async def test_anthropic_from_native_response_extracts_usage(): anthropic = pytest.importorskip("anthropic")