From 4241702260a5f8ec32200952c807892ae72c3798 Mon Sep 17 00:00:00 2001
From: Xeophon <46377542+xeophon@users.noreply.github.com>
Date: Mon, 11 May 2026 12:47:50 +0200
Subject: [PATCH 1/3] Map Anthropic reasoning effort by provider

Amp-Thread-ID: https://ampcode.com/threads/T-019e1682-ae38-76cc-bda3-d90d28b62198
Co-authored-by: Amp <amp@ampcode.com>
---
 tests/test_client_multimodal_types.py         | 126 ++++++++++++++++++
 .../clients/anthropic_messages_client.py      |  14 ++
 .../clients/openai_chat_completions_client.py |  23 ++++
 3 files changed, 163 insertions(+)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index d51c38262..38e67aaa9 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -1,5 +1,6 @@
 import pytest
 from types import SimpleNamespace
+from typing import Any
 
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
 from verifiers.types import (
@@ -18,6 +19,26 @@
 from verifiers.utils.response_utils import parse_response_message
 
 
+class _RecordingCreate:
+    def __init__(self, response: Any) -> None:
+        self.response = response
+        self.calls: list[dict[str, Any]] = []
+
+    async def create(self, **kwargs: Any) -> Any:
+        self.calls.append(kwargs)
+        return self.response
+
+
+def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]:
+    recorder = _RecordingCreate(response)
+    return SimpleNamespace(chat=SimpleNamespace(completions=recorder)), recorder
+
+
+def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]:
+    recorder = _RecordingCreate(response)
+    return SimpleNamespace(messages=recorder), recorder
+
+
 @pytest.mark.asyncio
 async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     client = OpenAIChatCompletionsClient(object())
@@ -52,6 +73,59 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     ]
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("model", "effort"),
+    [
+        ("anthropic/claude-opus-4.7", "xhigh"),
+        ("anthropic/claude-sonnet-4.6", "max"),
+    ],
+)
+async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity(
+    model: str, effort: str
+):
+    recording_client, recorder = _recording_openai(SimpleNamespace())
+    client = OpenAIChatCompletionsClient(recording_client)
+    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")
+
+    response = await client.get_native_response(
+        prompt=[],
+        model=model,
+        sampling_args={
+            "n": 1,
+            "reasoning_effort": effort,
+            "extra_body": {"reasoning": {"enabled": True}},
+        },
+    )
+
+    assert response is recorder.response
+    call = recorder.calls[0]
+    assert "reasoning_effort" not in call
+    assert call["extra_body"] == {
+        "reasoning": {"enabled": True},
+        "verbosity": effort,
+    }
+
+
+@pytest.mark.asyncio
+async def test_openrouter_anthropic_reasoning_effort_enables_reasoning():
+    recording_client, recorder = _recording_openai(SimpleNamespace())
+    client = OpenAIChatCompletionsClient(recording_client)
+    client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1")
+
+    await client.get_native_response(
+        prompt=[],
+        model="anthropic/claude-opus-4.7",
+        sampling_args={"reasoning_effort": "high"},
+    )
+
+    call = recorder.calls[0]
+    assert call["extra_body"] == {
+        "reasoning": {"enabled": True},
+        "verbosity": "high",
+    }
+
+
 @pytest.mark.asyncio
 async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     pytest.importorskip("anthropic")
@@ -151,6 +225,58 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag
     ]
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("model", "effort"),
+    [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")],
+)
+async def test_anthropic_reasoning_effort_maps_to_output_config(
+    model: str, effort: str
+):
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    recording_client, recorder = _recording_anthropic(SimpleNamespace())
+    client = AnthropicMessagesClient(recording_client)
+
+    response = await client.get_native_response(
+        prompt=[],
+        model=model,
+        sampling_args={"max_tokens": 128, "reasoning_effort": effort},
+    )
+
+    assert response is recorder.response
+    call = recorder.calls[0]
+    assert "reasoning_effort" not in call
+    assert call["output_config"] == {"effort": effort}
+    assert call["thinking"] == {"type": "adaptive"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_reasoning_effort_preserves_existing_output_config():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    recording_client, recorder = _recording_anthropic(SimpleNamespace())
+    client = AnthropicMessagesClient(recording_client)
+
+    await client.get_native_response(
+        prompt=[],
+        model="claude-opus-4-7",
+        sampling_args={
+            "max_tokens": 128,
+            "reasoning_effort": "high",
+            "output_config": {"format": {"type": "text"}},
+        },
+    )
+
+    call = recorder.calls[0]
+    assert call["output_config"] == {
+        "format": {"type": "text"},
+        "effort": "high",
+    }
+
+
 @pytest.mark.asyncio
 async def test_anthropic_from_native_response_extracts_usage():
     anthropic = pytest.importorskip("anthropic")
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 9e80b63b7..0f440dc0f 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -342,6 +342,20 @@ async def get_native_response(
     ) -> AnthropicMessage:
         def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
             sampling_args = dict(sampling_args)
+            reasoning_effort = sampling_args.pop("reasoning_effort", None)
+            model_id = model.lower().replace(".", "-").replace("_", "-")
+            if reasoning_effort is not None and (
+                model_id.startswith("anthropic/") or model_id.startswith("claude-")
+            ):
+                output_config = dict(sampling_args.get("output_config") or {})
+                output_config["effort"] = reasoning_effort
+                sampling_args["output_config"] = output_config
+                if "thinking" not in sampling_args and (
+                    "4-7" in model_id or "4-6" in model_id
+                ):
+                    sampling_args["thinking"] = {"type": "adaptive"}
+            elif reasoning_effort is not None:
+                sampling_args["reasoning_effort"] = reasoning_effort
             max_tokens = sampling_args.pop("max_tokens", None)
             sampling_args.pop("n", None)
             sampling_args.pop("stop", None)
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index c755d8dd4..c65cc90ad 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -252,6 +252,29 @@ async def get_native_response(
     ) -> OpenAIChatResponse:
         def normalize_sampling_args(sampling_args: SamplingArgs):
             sampling_args = dict(sampling_args)
+            api_base_url = None
+            if hasattr(self.client, "base_url"):
+                api_base_url = str(self.client.base_url)
+            elif self._config is not None:
+                api_base_url = self._config.api_base_url
+            reasoning_effort = sampling_args.pop("reasoning_effort", None)
+            model_id = model.lower().replace(".", "-").replace("_", "-")
+            api_base_url = (api_base_url or "").lower()
+            if (
+                reasoning_effort is not None
+                and (
+                    model_id.startswith("anthropic/") or model_id.startswith("claude-")
+                )
+                and ("openrouter.ai" in api_base_url or "pinference.ai" in api_base_url)
+            ):
+                extra_body = dict(sampling_args.get("extra_body") or {})
+                extra_body["verbosity"] = reasoning_effort
+                reasoning = dict(extra_body.get("reasoning") or {})
+                reasoning.setdefault("enabled", True)
+                extra_body["reasoning"] = reasoning
+                sampling_args["extra_body"] = extra_body
+            elif reasoning_effort is not None:
+                sampling_args["reasoning_effort"] = reasoning_effort
             if "max_tokens" in sampling_args:
                 sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
             return {k: v for k, v in sampling_args.items() if v is not None}

From e727960aec3fec8f04f5797a956ec867ece050b3 Mon Sep 17 00:00:00 2001
From: Xeophon <46377542+xeophon@users.noreply.github.com>
Date: Tue, 12 May 2026 15:42:59 +0200
Subject: [PATCH 2/3] Simplify Anthropic reasoning effort mapping

---
 tests/test_client_multimodal_types.py         | 39 +++++++++++++++++++
 .../clients/anthropic_messages_client.py      | 23 +++++++----
 .../clients/openai_chat_completions_client.py | 14 ++++---
 3 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index 38e67aaa9..ee701ffcc 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -126,6 +126,26 @@ async def test_openrouter_anthropic_reasoning_effort_enables_reasoning():
     }
 
 
+@pytest.mark.asyncio
+async def test_openrouter_anthropic_reasoning_effort_maps_opus_4_5():
+    recording_client, recorder = _recording_openai(SimpleNamespace())
+    client = OpenAIChatCompletionsClient(recording_client)
+    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")
+
+    await client.get_native_response(
+        prompt=[],
+        model="anthropic/claude-opus-4.5",
+        sampling_args={"reasoning_effort": "medium"},
+    )
+
+    call = recorder.calls[0]
+    assert "reasoning_effort" not in call
+    assert call["extra_body"] == {
+        "reasoning": {"enabled": True},
+        "verbosity": "medium",
+    }
+
+
 @pytest.mark.asyncio
 async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     pytest.importorskip("anthropic")
@@ -252,6 +272,25 @@ async def test_anthropic_reasoning_effort_maps_to_output_config(
     assert call["thinking"] == {"type": "adaptive"}
 
 
+@pytest.mark.asyncio
+async def test_anthropic_opus_4_5_uses_output_config_without_adaptive_thinking():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    recording_client, recorder = _recording_anthropic(SimpleNamespace())
+    client = AnthropicMessagesClient(recording_client)
+
+    await client.get_native_response(
+        prompt=[],
+        model="claude-opus-4-5",
+        sampling_args={"max_tokens": 4096, "reasoning_effort": "medium"},
+    )
+
+    call = recorder.calls[0]
+    assert call["output_config"] == {"effort": "medium"}
+    assert "thinking" not in call
+
+
 @pytest.mark.asyncio
 async def test_anthropic_reasoning_effort_preserves_existing_output_config():
     pytest.importorskip("anthropic")
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 0f440dc0f..31611d81d 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -50,6 +50,13 @@
 from verifiers.utils.client_utils import setup_anthropic_client
 
 
+ANTHROPIC_ADAPTIVE_THINKING_MODELS = {
+    "claude-opus-4-7",
+    "claude-opus-4-6",
+    "claude-sonnet-4-6",
+}
+
+
 def _handle_anthropic_overlong_prompt(func):
     """Decorator to handle overlong prompt errors from the Anthropic API."""
 
@@ -343,19 +350,19 @@ async def get_native_response(
         def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
             sampling_args = dict(sampling_args)
             reasoning_effort = sampling_args.pop("reasoning_effort", None)
-            model_id = model.lower().replace(".", "-").replace("_", "-")
-            if reasoning_effort is not None and (
-                model_id.startswith("anthropic/") or model_id.startswith("claude-")
-            ):
+            if reasoning_effort is not None:
+                model_id = (
+                    model.lower().split("/")[-1].replace(".", "-").replace("_", "-")
+                )
                 output_config = dict(sampling_args.get("output_config") or {})
                 output_config["effort"] = reasoning_effort
                 sampling_args["output_config"] = output_config
-                if "thinking" not in sampling_args and (
-                    "4-7" in model_id or "4-6" in model_id
+                if "thinking" not in sampling_args and any(
+                    model_id == adaptive_model
+                    or model_id.startswith(f"{adaptive_model}-")
+                    for adaptive_model in ANTHROPIC_ADAPTIVE_THINKING_MODELS
                 ):
                     sampling_args["thinking"] = {"type": "adaptive"}
-            elif reasoning_effort is not None:
-                sampling_args["reasoning_effort"] = reasoning_effort
             max_tokens = sampling_args.pop("max_tokens", None)
             sampling_args.pop("n", None)
             sampling_args.pop("stop", None)
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index c65cc90ad..0e4ef5090 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -258,15 +258,17 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             elif self._config is not None:
                 api_base_url = self._config.api_base_url
             reasoning_effort = sampling_args.pop("reasoning_effort", None)
-            model_id = model.lower().replace(".", "-").replace("_", "-")
-            api_base_url = (api_base_url or "").lower()
+            model_id = model.lower().split("/")[-1].replace(".", "-").replace("_", "-")
+            is_anthropic_route = (
+                "openrouter.ai" in (api_base_url or "").lower()
+                or "pinference.ai" in (api_base_url or "").lower()
+            )
             if (
                 reasoning_effort is not None
-                and (
-                    model_id.startswith("anthropic/") or model_id.startswith("claude-")
-                )
-                and ("openrouter.ai" in api_base_url or "pinference.ai" in api_base_url)
+                and model_id.startswith("claude-")
+                and is_anthropic_route
             ):
+                # OpenRouter/Pinference: send Claude reasoning_effort as extra_body["verbosity"].
                 extra_body = dict(sampling_args.get("extra_body") or {})
                 extra_body["verbosity"] = reasoning_effort
                 reasoning = dict(extra_body.get("reasoning") or {})

From a4c1fc71bbf365d5acb2eb79c8d2516aa7f06a89 Mon Sep 17 00:00:00 2001
From: Xeophon <46377542+xeophon@users.noreply.github.com>
Date: Tue, 12 May 2026 15:44:53 +0200
Subject: [PATCH 3/3] Remove Anthropic reasoning effort tests

---
 tests/test_client_multimodal_types.py | 165 --------------------------
 1 file changed, 165 deletions(-)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index ee701ffcc..d51c38262 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -1,6 +1,5 @@
 import pytest
 from types import SimpleNamespace
-from typing import Any
 
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
 from verifiers.types import (
@@ -19,26 +18,6 @@
 from verifiers.utils.response_utils import parse_response_message
 
 
-class _RecordingCreate:
-    def __init__(self, response: Any) -> None:
-        self.response = response
-        self.calls: list[dict[str, Any]] = []
-
-    async def create(self, **kwargs: Any) -> Any:
-        self.calls.append(kwargs)
-        return self.response
-
-
-def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]:
-    recorder = _RecordingCreate(response)
-    return SimpleNamespace(chat=SimpleNamespace(completions=recorder)), recorder
-
-
-def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]:
-    recorder = _RecordingCreate(response)
-    return SimpleNamespace(messages=recorder), recorder
-
-
 @pytest.mark.asyncio
 async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     client = OpenAIChatCompletionsClient(object())
@@ -73,79 +52,6 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     ]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    ("model", "effort"),
-    [
-        ("anthropic/claude-opus-4.7", "xhigh"),
-        ("anthropic/claude-sonnet-4.6", "max"),
-    ],
-)
-async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity(
-    model: str, effort: str
-):
-    recording_client, recorder = _recording_openai(SimpleNamespace())
-    client = OpenAIChatCompletionsClient(recording_client)
-    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")
-
-    response = await client.get_native_response(
-        prompt=[],
-        model=model,
-        sampling_args={
-            "n": 1,
-            "reasoning_effort": effort,
-            "extra_body": {"reasoning": {"enabled": True}},
-        },
-    )
-
-    assert response is recorder.response
-    call = recorder.calls[0]
-    assert "reasoning_effort" not in call
-    assert call["extra_body"] == {
-        "reasoning": {"enabled": True},
-        "verbosity": effort,
-    }
-
-
-@pytest.mark.asyncio
-async def test_openrouter_anthropic_reasoning_effort_enables_reasoning():
-    recording_client, recorder = _recording_openai(SimpleNamespace())
-    client = OpenAIChatCompletionsClient(recording_client)
-    client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1")
-
-    await client.get_native_response(
-        prompt=[],
-        model="anthropic/claude-opus-4.7",
-        sampling_args={"reasoning_effort": "high"},
-    )
-
-    call = recorder.calls[0]
-    assert call["extra_body"] == {
-        "reasoning": {"enabled": True},
-        "verbosity": "high",
-    }
-
-
-@pytest.mark.asyncio
-async def test_openrouter_anthropic_reasoning_effort_maps_opus_4_5():
-    recording_client, recorder = _recording_openai(SimpleNamespace())
-    client = OpenAIChatCompletionsClient(recording_client)
-    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")
-
-    await client.get_native_response(
-        prompt=[],
-        model="anthropic/claude-opus-4.5",
-        sampling_args={"reasoning_effort": "medium"},
-    )
-
-    call = recorder.calls[0]
-    assert "reasoning_effort" not in call
-    assert call["extra_body"] == {
-        "reasoning": {"enabled": True},
-        "verbosity": "medium",
-    }
-
-
 @pytest.mark.asyncio
 async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     pytest.importorskip("anthropic")
@@ -245,77 +151,6 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag
     ]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    ("model", "effort"),
-    [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")],
-)
-async def test_anthropic_reasoning_effort_maps_to_output_config(
-    model: str, effort: str
-):
-    pytest.importorskip("anthropic")
-    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
-
-    recording_client, recorder = _recording_anthropic(SimpleNamespace())
-    client = AnthropicMessagesClient(recording_client)
-
-    response = await client.get_native_response(
-        prompt=[],
-        model=model,
-        sampling_args={"max_tokens": 128, "reasoning_effort": effort},
-    )
-
-    assert response is recorder.response
-    call = recorder.calls[0]
-    assert "reasoning_effort" not in call
-    assert call["output_config"] == {"effort": effort}
-    assert call["thinking"] == {"type": "adaptive"}
-
-
-@pytest.mark.asyncio
-async def test_anthropic_opus_4_5_uses_output_config_without_adaptive_thinking():
-    pytest.importorskip("anthropic")
-    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
-
-    recording_client, recorder = _recording_anthropic(SimpleNamespace())
-    client = AnthropicMessagesClient(recording_client)
-
-    await client.get_native_response(
-        prompt=[],
-        model="claude-opus-4-5",
-        sampling_args={"max_tokens": 4096, "reasoning_effort": "medium"},
-    )
-
-    call = recorder.calls[0]
-    assert call["output_config"] == {"effort": "medium"}
-    assert "thinking" not in call
-
-
-@pytest.mark.asyncio
-async def test_anthropic_reasoning_effort_preserves_existing_output_config():
-    pytest.importorskip("anthropic")
-    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
-
-    recording_client, recorder = _recording_anthropic(SimpleNamespace())
-    client = AnthropicMessagesClient(recording_client)
-
-    await client.get_native_response(
-        prompt=[],
-        model="claude-opus-4-7",
-        sampling_args={
-            "max_tokens": 128,
-            "reasoning_effort": "high",
-            "output_config": {"format": {"type": "text"}},
-        },
-    )
-
-    call = recorder.calls[0]
-    assert call["output_config"] == {
-        "format": {"type": "text"},
-        "effort": "high",
-    }
-
-
 @pytest.mark.asyncio
 async def test_anthropic_from_native_response_extracts_usage():
     anthropic = pytest.importorskip("anthropic")