Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion renderers/kimi_k25.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None:
emit_special=emit_special,
emit_text=emit_text,
emit_ids=emit_ids,
emit_image=emit_image,
)
elif msg.get("content") is not None:
# User / other content branches — images allowed.
Expand Down Expand Up @@ -956,6 +957,7 @@ def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
emit_special=emit_special,
emit_text=emit_text,
emit_ids=emit_ids,
emit_image=emit_image,
)
elif msg.get("content") is not None:
self._emit_content(
Expand Down Expand Up @@ -1168,19 +1170,32 @@ def _render_tool_body(
emit_special,
emit_text,
emit_ids,
emit_image=None,
) -> None:
"""Emit tool-result body (after the role tag): ``## Return of {id}\\n``
+ content. No ``<|im_end|>``; caller emits that.

The K2.5 template emits the ``## Return of …`` header unconditionally
— when ``tool_call_id`` is missing the interpolation yields empty
string and you get ``## Return of \\n``. We mirror that.

``emit_image`` is threaded through to ``_emit_content`` so image parts
inside ``content`` lists (browser-agent screenshots, etc.) render as
``<|media_begin|>image<|media_content|><|media_pad|><|media_end|>``
inline. Without it those parts are silently dropped.
"""
tool_call_id = msg.get("tool_call_id") or ""
emit_text(f"## Return of {tool_call_id}\n", msg_idx)
content = msg.get("content")
if content is not None:
self._emit_content(content, msg_idx, emit_special, emit_text, emit_ids)
self._emit_content(
content,
msg_idx,
emit_special,
emit_text,
emit_ids,
emit_image=emit_image,
)

def _normalize_response_tokens(self, response: list[int]) -> list[int]:
"""Restore the synthetic ``<think>`` prefill if the sampler stripped it.
Expand Down
54 changes: 48 additions & 6 deletions renderers/qwen35.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,9 @@ def flush_buf() -> None:
self._render_tool(
messages,
i,
content,
emit_special=emit_special,
emit_text=emit_text,
emit_image=emit_image,
)

else:
Expand Down Expand Up @@ -539,7 +539,7 @@ def emit_special(token_id: int, _msg_idx: int = -1) -> None:
def emit_text(text: str, _msg_idx: int = -1) -> None:
tokens.extend(self._encode(text))

def emit_image(part: dict[str, Any]) -> None:
def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
_, out, n, h = self._process_image(part)
emit_special(self._vision_start)
offset = len(tokens)
Expand Down Expand Up @@ -613,9 +613,9 @@ def flush_buf() -> None:
self._render_tool(
new_messages,
i,
content,
emit_special=emit_special,
emit_text=emit_text,
emit_image=emit_image,
)
else:
return None
Expand Down Expand Up @@ -793,24 +793,66 @@ def _render_tool(
self,
messages: list[Message],
msg_idx: int,
content: str,
*,
emit_special,
emit_text,
emit_image,
) -> None:
# Consecutive tool messages are grouped under a single <|im_start|>user block
# Consecutive tool messages share a single <|im_start|>user ... <|im_end|>
# envelope. Whether to open and close the envelope depends only on the
# neighbouring roles, never on the content type of this or any other
# tool message — keep this predicate text/media-agnostic.
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
next_is_tool = (
msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
)
raw_content = messages[msg_idx].get("content")

if not prev_is_tool:
emit_special(self._im_start, msg_idx)
emit_text("user", msg_idx)

emit_text("\n", msg_idx)
emit_special(self._tool_response, msg_idx)
emit_text("\n" + content + "\n", msg_idx)

if self._content_has_media(raw_content):
# Mirror the chat template's ``render_content`` macro for list
# content: text segments BPE-encode together up to a media
# boundary, then each image emits ``<|vision_start|>`` + N×
# ``<|image_pad|>`` + ``<|vision_end|>`` inline.
# ``_content_has_media`` returns False unless content is a list,
# but the type checker can't follow that through the call.
assert isinstance(raw_content, list)
buf: list[str] = ["\n"]

def flush_buf() -> None:
if buf:
emit_text("".join(buf), msg_idx)
buf.clear()

for item in raw_content:
if isinstance(item, str):
buf.append(item)
elif isinstance(item, dict):
if _is_image_part(item):
flush_buf()
emit_image(item, msg_idx)
elif _is_video_part(item):
raise NotImplementedError(
"Video parts are not yet supported by Qwen35Renderer."
)
elif "text" in item:
buf.append(item["text"])
else:
raise ValueError(f"Unexpected content item: {item}")
else:
raise ValueError(f"Unexpected content item: {item}")
flush_buf()
emit_text("\n", msg_idx)
else:
content = self._render_content(raw_content).strip()
emit_text("\n" + content + "\n", msg_idx)

emit_special(self._tool_response_end, msg_idx)

if not next_is_tool:
Expand Down
169 changes: 169 additions & 0 deletions tests/test_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,142 @@ def _build_cases(make_part, image):
]


def _build_tool_image_cases(make_part, image):
    """Tool-message image scenarios. Targets renderers that emit image
    placeholders inside ``<tool_response>`` blocks. Browser-agent style
    trajectories produce post-action screenshots as tool responses, so
    handling images here is load-bearing for that workload."""

    def tool_call(call_id, fn_name):
        # One OpenAI-style function tool-call entry with empty arguments.
        return {
            "id": call_id,
            "type": "function",
            "function": {"name": fn_name, "arguments": {}},
        }

    def assistant_calls(*calls):
        # Empty-content assistant turn that issues the given tool calls.
        return {"role": "assistant", "content": "", "tool_calls": list(calls)}

    def tool_reply(call_id, caption):
        # Tool response carrying a text part followed by the image part.
        return {
            "role": "tool",
            "tool_call_id": call_id,
            "content": [{"type": "text", "text": caption}, make_part(image)],
        }

    # Single screenshot round-trip.
    single_shot = [
        {"role": "user", "content": "Take a screenshot."},
        assistant_calls(tool_call("c1", "screenshot")),
        tool_reply("c1", "Screenshot captured."),
    ]

    # Two screenshot rounds with an assistant/user exchange in between.
    multi_turn = [
        {"role": "user", "content": "Screenshot then describe."},
        assistant_calls(tool_call("c1", "screenshot")),
        tool_reply("c1", "Done:"),
        {"role": "assistant", "content": "A square."},
        {"role": "user", "content": "Now show me the next page."},
        assistant_calls(tool_call("c2", "screenshot")),
        tool_reply("c2", "Next page:"),
    ]

    # Three consecutive tool responses: plain text, image, plain text.
    mixed_media = [
        {"role": "user", "content": "Run a few tools."},
        assistant_calls(
            tool_call("c1", "ping"),
            tool_call("c2", "screenshot"),
            tool_call("c3", "ping"),
        ),
        {"role": "tool", "tool_call_id": "c1", "content": "pong"},
        tool_reply("c2", "Screenshot:"),
        {"role": "tool", "tool_call_id": "c3", "content": "pong"},
    ]

    return [
        pytest.param(single_shot, False, id="tool_response_with_image"),
        pytest.param(multi_turn, False, id="multi_turn_tool_response_images"),
        pytest.param(mixed_media, False, id="consecutive_tools_mixed_media"),
    ]


# ---------------------------------------------------------------------------
# Tests.
# ---------------------------------------------------------------------------


def _supports_tool_message_images(renderer) -> bool:
    """Return True when *renderer* renders image parts found inside
    tool-response content. Renderers lacking the feature drop such image
    parts silently; add each renderer here as it gains support so the
    parity test starts asserting against it."""
    # Imported lazily so collecting this module never pulls in the
    # renderer packages for unrelated tests.
    from renderers.kimi_k25 import KimiK25Renderer
    from renderers.qwen35 import Qwen35Renderer

    supported_types = (Qwen35Renderer, KimiK25Renderer)
    return isinstance(renderer, supported_types)


@pytest.mark.parametrize(
"mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
)
Expand Down Expand Up @@ -513,6 +644,44 @@ def test_modality_registry_models_route_to_renderer():
)


@pytest.mark.parametrize(
    "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
)
def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image):
    """Tool-message image parity vs ``processor.apply_chat_template`` + ``processor(...)``.

    Browser-agent SFT traces carry post-action screenshots as ``tool``
    responses. Renderers that drop those image parts silently — historically
    every Qwen-VL family renderer did — produce token streams that diverge
    from the HF processor and lose most of the visual learning signal.
    Skipped for renderers that haven't grown the feature yet; flips to a
    real assertion as they do.
    """
    # Guard clauses: modality gate, local-snapshot gate, feature gate.
    if modality != "image":
        pytest.skip("Tool-response media path is image-only for now.")
    if not _hf_snapshot_cached(mm_model_name):
        pytest.skip(f"{mm_model_name}: HF snapshot not cached locally")

    kit = _modality_kit(modality, mm_model_name)
    _tokenizer, processor, renderer = _load_processor_and_renderer(mm_model_name)

    if not _supports_tool_message_images(renderer):
        pytest.skip(
            f"{type(renderer).__name__} does not yet emit images inside tool responses"
        )

    # Each case compares our renderer's ids byte-for-byte against HF's.
    for case in _build_tool_image_cases(kit["make_part"], tiny_image):
        messages, add_gp = case.values
        rendered = renderer.render_ids(messages, add_generation_prompt=add_gp)
        expected = kit["processor_input_ids"](processor, messages, add_gp)
        assert rendered == expected, (
            f"{mm_model_name} / tool / case={case.id}: "
            "renderer diverges from processor.\n"
            f" len(ours)={len(rendered)} len(theirs)={len(expected)}\n"
            f" ours[:60]={rendered[:60]}\n theirs[:60]={expected[:60]}"
        )


def test_qwen3_vl_renderer_exposes_image_modality():
"""The flagship multimodal renderer is concretely Qwen3VLRenderer.

Expand Down
Loading