diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 0984498..b2c181a 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -795,6 +795,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None: emit_special=emit_special, emit_text=emit_text, emit_ids=emit_ids, + emit_image=emit_image, ) elif msg.get("content") is not None: # User / other content branches — images allowed. @@ -956,6 +957,7 @@ def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None: emit_special=emit_special, emit_text=emit_text, emit_ids=emit_ids, + emit_image=emit_image, ) elif msg.get("content") is not None: self._emit_content( @@ -1168,6 +1170,7 @@ def _render_tool_body( emit_special, emit_text, emit_ids, + emit_image=None, ) -> None: """Emit tool-result body (after the role tag): ``## Return of {id}\\n`` + content. No ``<|im_end|>``; caller emits that. @@ -1175,12 +1178,24 @@ def _render_tool_body( The K2.5 template emits the ``## Return of …`` header unconditionally — when ``tool_call_id`` is missing the interpolation yields empty string and you get ``## Return of \\n``. We mirror that. + + ``emit_image`` is threaded through to ``_emit_content`` so image parts + inside ``content`` lists (browser-agent screenshots, etc.) render as + ``<|media_begin|>image<|media_content|><|media_pad|><|media_end|>`` + inline. Without it those parts are silently dropped. """ tool_call_id = msg.get("tool_call_id") or "" emit_text(f"## Return of {tool_call_id}\n", msg_idx) content = msg.get("content") if content is not None: - self._emit_content(content, msg_idx, emit_special, emit_text, emit_ids) + self._emit_content( + content, + msg_idx, + emit_special, + emit_text, + emit_ids, + emit_image=emit_image, + ) def _normalize_response_tokens(self, response: list[int]) -> list[int]: """Restore the synthetic ```` prefill if the sampler stripped it. diff --git a/renderers/qwen35.py b/renderers/qwen35.py index 05f155e..11a16b8 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -439,9 +439,9 @@ def flush_buf() -> None: self._render_tool( messages, i, - content, emit_special=emit_special, emit_text=emit_text, + emit_image=emit_image, ) else: @@ -539,7 +539,7 @@ def emit_special(token_id: int, _msg_idx: int = -1) -> None: def emit_text(text: str, _msg_idx: int = -1) -> None: tokens.extend(self._encode(text)) - def emit_image(part: dict[str, Any]) -> None: + def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None: _, out, n, h = self._process_image(part) emit_special(self._vision_start) offset = len(tokens) @@ -613,9 +613,9 @@ def flush_buf() -> None: self._render_tool( new_messages, i, - content, emit_special=emit_special, emit_text=emit_text, + emit_image=emit_image, ) else: return None @@ -793,16 +793,20 @@ def _render_tool( self, messages: list[Message], msg_idx: int, - content: str, *, emit_special, emit_text, + emit_image, ) -> None: - # Consecutive tool messages are grouped under a single <|im_start|>user block + # Consecutive tool messages share a single <|im_start|>user ... <|im_end|> + # envelope. Whether to open and close the envelope depends only on the + # neighbouring roles, never on the content type of this or any other + # tool message — keep this predicate text/media-agnostic. prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool" next_is_tool = ( msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool" ) + raw_content = messages[msg_idx].get("content") if not prev_is_tool: emit_special(self._im_start, msg_idx) @@ -810,7 +814,45 @@ def _render_tool( emit_text("\n", msg_idx) emit_special(self._tool_response, msg_idx) - emit_text("\n" + content + "\n", msg_idx) + + if self._content_has_media(raw_content): + # Mirror the chat template's ``render_content`` macro for list + # content: text segments BPE-encode together up to a media + # boundary, then each image emits ``<|vision_start|>`` + N× + # ``<|image_pad|>`` + ``<|vision_end|>`` inline. + # ``_content_has_media`` returns False unless content is a list, + # but the type checker can't follow that through the call. + assert isinstance(raw_content, list) + buf: list[str] = ["\n"] + + def flush_buf() -> None: + if buf: + emit_text("".join(buf), msg_idx) + buf.clear() + + for item in raw_content: + if isinstance(item, str): + buf.append(item) + elif isinstance(item, dict): + if _is_image_part(item): + flush_buf() + emit_image(item, msg_idx) + elif _is_video_part(item): + raise NotImplementedError( + "Video parts are not yet supported by Qwen35Renderer." + ) + elif "text" in item: + buf.append(item["text"]) + else: + raise ValueError(f"Unexpected content item: {item}") + else: + raise ValueError(f"Unexpected content item: {item}") + flush_buf() + emit_text("\n", msg_idx) + else: + content = self._render_content(raw_content).strip() + emit_text("\n" + content + "\n", msg_idx) + emit_special(self._tool_response_end, msg_idx) if not next_is_tool: diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py index 35b6cae..825781f 100644 --- a/tests/test_multimodal.py +++ b/tests/test_multimodal.py @@ -299,11 +299,142 @@ def _build_cases(make_part, image): ] +def _build_tool_image_cases(make_part, image): + """Tool-message image scenarios. Targets renderers that emit image + placeholders inside ```` blocks. Browser-agent style + trajectories produce post-action screenshots as tool responses, so + handling images here is load-bearing for that workload.""" + return [ + pytest.param( + [ + {"role": "user", "content": "Take a screenshot."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "screenshot", "arguments": {}}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": [ + {"type": "text", "text": "Screenshot captured."}, + make_part(image), + ], + }, + ], + False, + id="tool_response_with_image", + ), + pytest.param( + [ + {"role": "user", "content": "Screenshot then describe."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "screenshot", "arguments": {}}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": [ + {"type": "text", "text": "Done:"}, + make_part(image), + ], + }, + {"role": "assistant", "content": "A square."}, + {"role": "user", "content": "Now show me the next page."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c2", + "type": "function", + "function": {"name": "screenshot", "arguments": {}}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c2", + "content": [ + {"type": "text", "text": "Next page:"}, + make_part(image), + ], + }, + ], + False, + id="multi_turn_tool_response_images", + ), + pytest.param( + [ + {"role": "user", "content": "Run a few tools."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "ping", "arguments": {}}, + }, + { + "id": "c2", + "type": "function", + "function": {"name": "screenshot", "arguments": {}}, + }, + { + "id": "c3", + "type": "function", + "function": {"name": "ping", "arguments": {}}, + }, + ], + }, + {"role": "tool", "tool_call_id": "c1", "content": "pong"}, + { + "role": "tool", + "tool_call_id": "c2", + "content": [ + {"type": "text", "text": "Screenshot:"}, + make_part(image), + ], + }, + {"role": "tool", "tool_call_id": "c3", "content": "pong"}, + ], + False, + id="consecutive_tools_mixed_media", + ), + ] + + # --------------------------------------------------------------------------- # Tests. # --------------------------------------------------------------------------- +def _supports_tool_message_images(renderer) -> bool: + """True iff this renderer emits image placeholders inside tool-response + content. Renderers without the feature silently drop image parts in tool + content; as they grow the feature they get added here and the test starts + asserting against them.""" + from renderers.kimi_k25 import KimiK25Renderer + from renderers.qwen35 import Qwen35Renderer + + return isinstance(renderer, (Qwen35Renderer, KimiK25Renderer)) + + @pytest.mark.parametrize( "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] ) @@ -513,6 +644,44 @@ def test_modality_registry_models_route_to_renderer(): ) +@pytest.mark.parametrize( + "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] +) +def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image): + """Tool-message image parity vs ``processor.apply_chat_template`` + ``processor(...)``. + + Browser-agent SFT traces carry post-action screenshots as ``tool`` + responses. Renderers that drop those image parts silently — historically + every Qwen-VL family renderer did — produce token streams that diverge + from the HF processor and lose most of the visual learning signal. + Skipped for renderers that haven't grown the feature yet; flips to a + real assertion as they do. + """ + if modality != "image": + pytest.skip("Tool-response media path is image-only for now.") + if not _hf_snapshot_cached(mm_model_name): + pytest.skip(f"{mm_model_name}: HF snapshot not cached locally") + + kit = _modality_kit(modality, mm_model_name) + tokenizer, processor, renderer = _load_processor_and_renderer(mm_model_name) + + if not _supports_tool_message_images(renderer): + pytest.skip( + f"{type(renderer).__name__} does not yet emit images inside tool responses" + ) + + for case in _build_tool_image_cases(kit["make_part"], tiny_image): + messages, add_gp = case.values + ours = renderer.render_ids(messages, add_generation_prompt=add_gp) + theirs = kit["processor_input_ids"](processor, messages, add_gp) + assert ours == theirs, ( + f"{mm_model_name} / tool / case={case.id}: " + f"renderer diverges from processor.\n" + f" len(ours)={len(ours)} len(theirs)={len(theirs)}\n" + f" ours[:60]={ours[:60]}\n theirs[:60]={theirs[:60]}" + ) + + def test_qwen3_vl_renderer_exposes_image_modality(): """The flagship multimodal renderer is concretely Qwen3VLRenderer.