Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions tests/test_client_multimodal_types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from types import SimpleNamespace
from typing import Any

from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
from verifiers.types import (
Expand All @@ -18,6 +19,26 @@
from verifiers.utils.response_utils import parse_response_message


class _RecordingCreate:
    """Async stub standing in for an SDK ``.create(...)`` endpoint.

    Every call's keyword arguments are recorded (in order) on ``calls`` and a
    canned ``response`` object is returned, so tests can assert exactly what
    the client under test sent to the provider.
    """

    def __init__(self, response: Any) -> None:
        # Canned object echoed back from every ``create`` invocation.
        self.response = response
        # One kwargs dict per recorded call, oldest first.
        self.calls: list[dict[str, Any]] = []

    async def create(self, **kwargs: Any) -> Any:
        """Record *kwargs* and return the canned response."""
        # Store a shallow copy so that if the caller later mutates the
        # dict it passed through, the recorded call log stays intact.
        self.calls.append(dict(kwargs))
        return self.response


def _recording_openai(response: Any) -> tuple[Any, _RecordingCreate]:
    """Build a fake OpenAI SDK client whose ``chat.completions.create`` is a recorder.

    Returns the fake client plus the recorder so tests can inspect the calls.
    """
    recorder = _RecordingCreate(response)
    fake_client = SimpleNamespace(chat=SimpleNamespace(completions=recorder))
    return fake_client, recorder


def _recording_anthropic(response: Any) -> tuple[Any, _RecordingCreate]:
    """Build a fake Anthropic SDK client whose ``messages.create`` is a recorder.

    Returns the fake client plus the recorder so tests can inspect the calls.
    """
    recorder = _RecordingCreate(response)
    fake_client = SimpleNamespace(messages=recorder)
    return fake_client, recorder


@pytest.mark.asyncio
async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
client = OpenAIChatCompletionsClient(object())
Expand Down Expand Up @@ -52,6 +73,59 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
]


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("model", "effort"),
    [
        ("anthropic/claude-opus-4.7", "xhigh"),
        ("anthropic/claude-sonnet-4.6", "max"),
    ],
)
async def test_openrouter_anthropic_reasoning_effort_maps_to_verbosity(
    model: str, effort: str
):
    """For Anthropic models behind OpenRouter, ``reasoning_effort`` is consumed
    and re-expressed as ``extra_body.verbosity``, merging with any existing
    ``reasoning`` settings instead of clobbering them."""
    fake_openai, recorder = _recording_openai(SimpleNamespace())
    client = OpenAIChatCompletionsClient(fake_openai)
    client._config = SimpleNamespace(api_base_url="https://openrouter.ai/api/v1")

    result = await client.get_native_response(
        prompt=[],
        model=model,
        sampling_args={
            "n": 1,
            "reasoning_effort": effort,
            "extra_body": {"reasoning": {"enabled": True}},
        },
    )

    # The canned response object is passed through untouched.
    assert result is recorder.response

    call = recorder.calls[0]
    # The OpenAI-style field must not be forwarded verbatim...
    assert "reasoning_effort" not in call
    # ...it is translated into the gateway's extra_body shape.
    expected_extra_body = {
        "reasoning": {"enabled": True},
        "verbosity": effort,
    }
    assert call["extra_body"] == expected_extra_body


@pytest.mark.asyncio
async def test_openrouter_anthropic_reasoning_effort_enables_reasoning():
    """When the caller supplies no ``reasoning`` block at all, mapping
    ``reasoning_effort`` also switches reasoning on in ``extra_body``."""
    fake_openai, recorder = _recording_openai(SimpleNamespace())
    client = OpenAIChatCompletionsClient(fake_openai)
    client._config = SimpleNamespace(api_base_url="https://api.pinference.ai/api/v1")

    await client.get_native_response(
        prompt=[],
        model="anthropic/claude-opus-4.7",
        sampling_args={"reasoning_effort": "high"},
    )

    call = recorder.calls[0]
    expected_extra_body = {
        "reasoning": {"enabled": True},
        "verbosity": "high",
    }
    assert call["extra_body"] == expected_extra_body


@pytest.mark.asyncio
async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
pytest.importorskip("anthropic")
Expand Down Expand Up @@ -151,6 +225,58 @@ async def test_anthropic_merges_consecutive_tool_results_into_single_user_messag
]


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("model", "effort"),
    [("claude-opus-4-7", "xhigh"), ("claude-sonnet-4-6", "max")],
)
async def test_anthropic_reasoning_effort_maps_to_output_config(
    model: str, effort: str
):
    """On the native Anthropic client, ``reasoning_effort`` becomes
    ``output_config.effort`` and adaptive thinking is turned on."""
    pytest.importorskip("anthropic")
    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient

    fake_anthropic, recorder = _recording_anthropic(SimpleNamespace())
    messages_client = AnthropicMessagesClient(fake_anthropic)

    result = await messages_client.get_native_response(
        prompt=[],
        model=model,
        sampling_args={"max_tokens": 128, "reasoning_effort": effort},
    )

    # The canned response object is passed through untouched.
    assert result is recorder.response

    call = recorder.calls[0]
    # The OpenAI-style field is consumed, not forwarded verbatim.
    assert "reasoning_effort" not in call
    assert call["output_config"] == {"effort": effort}
    assert call["thinking"] == {"type": "adaptive"}


@pytest.mark.asyncio
async def test_anthropic_reasoning_effort_preserves_existing_output_config():
    """A caller-supplied ``output_config`` is merged into — its existing keys
    survive alongside the injected ``effort``."""
    pytest.importorskip("anthropic")
    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient

    fake_anthropic, recorder = _recording_anthropic(SimpleNamespace())
    messages_client = AnthropicMessagesClient(fake_anthropic)

    await messages_client.get_native_response(
        prompt=[],
        model="claude-opus-4-7",
        sampling_args={
            "max_tokens": 128,
            "reasoning_effort": "high",
            "output_config": {"format": {"type": "text"}},
        },
    )

    call = recorder.calls[0]
    expected_output_config = {
        "format": {"type": "text"},
        "effort": "high",
    }
    assert call["output_config"] == expected_output_config


@pytest.mark.asyncio
async def test_anthropic_from_native_response_extracts_usage():
anthropic = pytest.importorskip("anthropic")
Expand Down
14 changes: 14 additions & 0 deletions verifiers/clients/anthropic_messages_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,20 @@ async def get_native_response(
) -> AnthropicMessage:
def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
sampling_args = dict(sampling_args)
reasoning_effort = sampling_args.pop("reasoning_effort", None)
model_id = model.lower().replace(".", "-").replace("_", "-")
if reasoning_effort is not None and (
model_id.startswith("anthropic/") or model_id.startswith("claude-")
):
output_config = dict(sampling_args.get("output_config") or {})
output_config["effort"] = reasoning_effort
sampling_args["output_config"] = output_config
if "thinking" not in sampling_args and (
"4-7" in model_id or "4-6" in model_id
):
sampling_args["thinking"] = {"type": "adaptive"}
elif reasoning_effort is not None:
sampling_args["reasoning_effort"] = reasoning_effort
max_tokens = sampling_args.pop("max_tokens", None)
sampling_args.pop("n", None)
sampling_args.pop("stop", None)
Expand Down
23 changes: 23 additions & 0 deletions verifiers/clients/openai_chat_completions_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,29 @@ async def get_native_response(
) -> OpenAIChatResponse:
def normalize_sampling_args(sampling_args: SamplingArgs):
    """Return a provider-ready copy of *sampling_args*, dropping None values.

    For Anthropic models reached through an OpenRouter/Pinference gateway,
    the OpenAI-style ``reasoning_effort`` field is removed and re-expressed
    inside ``extra_body`` (``verbosity`` plus ``reasoning.enabled``); for
    other endpoints it is forwarded unchanged. ``max_tokens`` is renamed to
    ``max_completion_tokens``.
    """
    args = dict(sampling_args)

    # Resolve the endpoint URL: prefer the SDK client's own base_url, then
    # fall back to the configured one; normalize to a lowercase string.
    if hasattr(self.client, "base_url"):
        base_url = str(self.client.base_url)
    elif self._config is not None:
        base_url = self._config.api_base_url
    else:
        base_url = None
    base_url = (base_url or "").lower()

    effort = args.pop("reasoning_effort", None)
    # Normalize separators so version variants of the model id compare uniformly.
    normalized_model = model.lower().replace(".", "-").replace("_", "-")

    if effort is not None:
        is_anthropic_model = normalized_model.startswith(("anthropic/", "claude-"))
        via_gateway = "openrouter.ai" in base_url or "pinference.ai" in base_url
        if is_anthropic_model and via_gateway:
            # Translate effort into the gateway's extra_body shape, merging
            # with (not replacing) any reasoning settings already present.
            body = dict(args.get("extra_body") or {})
            body["verbosity"] = effort
            reasoning_cfg = dict(body.get("reasoning") or {})
            reasoning_cfg.setdefault("enabled", True)
            body["reasoning"] = reasoning_cfg
            args["extra_body"] = body
        else:
            args["reasoning_effort"] = effort

    # Chat Completions expects max_completion_tokens rather than max_tokens.
    if "max_tokens" in args:
        args["max_completion_tokens"] = args.pop("max_tokens")

    return {k: v for k, v in args.items() if v is not None}
Expand Down
Loading