From a6c6ee43abb60b182a94a48340d563f60ab78c90 Mon Sep 17 00:00:00 2001 From: Carlos Bribiescas Date: Mon, 25 May 2026 16:41:45 +0800 Subject: [PATCH 1/3] fix(gpt-oss): route harmony prompts through openai-harmony MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses #568. GPT-OSS prompts go through ``tokenizer.apply_chat_template`` today, but ``api.utils.extract_multimodal_content()`` text-flattens prior assistant ``tool_calls`` to ``[Calling tool: X(args)]`` bracket strings when the active tool parser doesn't preserve native format. That breaks multi-turn agentic workloads on GPT-OSS — the model sees a malformed conversation and falls back to one-shot final-channel responses (avg_turns ~1, no tool use). This patch adds a separate prompt-rendering path that, only when active, routes through OpenAI's canonical ``openai-harmony`` renderer instead of the Jinja template. The renderer accepts structured ``Conversation`` objects (messages + tool definitions) and emits the wire format the GPT-OSS weights were trained on, sidestepping the text-flattening upstream entirely. Per @Thump604's suggested patch shape on #568: 1. **Regression test** — ``tests/test_harmony_render.py`` (9 tests) covers the multi-turn assistant-tool/tool-result rendering, section order, channel placement, generation-prompt suffix, and explicitly asserts the bracket-text fallback NEVER appears. 2. **Scoped narrowly** — engine reads ``self.use_harmony_rendering`` (default ``False``); set by the server only when ``--tool-call-parser harmony``/``gpt-oss`` is active AND ``openai-harmony`` is importable. The flag controls one ``if`` at the prompt-render site; non-harmony parsers and templates are untouched. 3. **Routed through openai-harmony** — new ``vllm_mlx/utils/harmony_render.py`` converts OpenAI-format messages + tools to a ``Conversation`` and calls ``HarmonyEncoding.render_conversation_for_completion``. Tool-call-id -> function-name resolution is done locally so the OpenAI ``role=tool`` shape (which lacks the function name field) flows through correctly. 4. **Non-harmony behavior unchanged** — when the harmony path is inactive (parser is anything other than harmony/gpt-oss, OR the optional ``openai-harmony`` package is not installed) the existing ``apply_chat_template`` codepath runs verbatim. The system-prefix KV cache probe (which assumes the Jinja path) is gated off for harmony requests so the probe/actual-prompt strings can't desynchronize. ``openai-harmony`` is added as an optional extra (``pip install vllm-mlx[harmony]``) so the default install footprint is unchanged. All 124 existing parser tests still pass alongside the 9 new harmony tests. --- pyproject.toml | 7 + tests/test_harmony_render.py | 226 ++++++++++++++++++++++++ vllm_mlx/engine/simple.py | 44 ++++- vllm_mlx/server.py | 41 +++++ vllm_mlx/utils/harmony_render.py | 286 +++++++++++++++++++++++++++++++ 5 files changed, 596 insertions(+), 8 deletions(-) create mode 100644 tests/test_harmony_render.py create mode 100644 vllm_mlx/utils/harmony_render.py diff --git a/pyproject.toml b/pyproject.toml index 3bfe353b1..05efe7b62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,13 @@ vision = [ "torch>=2.8.0", "torchvision>=0.23.0", ] +# Harmony prompt rendering for GPT-OSS models (see #568). With this installed +# and ``--tool-call-parser harmony``/``gpt-oss`` set, the engine routes prompt +# building through OpenAI's canonical renderer instead of the Jinja chat +# template (bypasses the bracket-text fallback that loses ``tool_calls``). +harmony = [ + "openai-harmony>=0.0.8", +] # Audio dependencies for TTS/STT (mlx-audio) audio = [ "mlx-audio>=0.2.9", diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py new file mode 100644 index 000000000..2b6c747ff --- /dev/null +++ b/tests/test_harmony_render.py @@ -0,0 +1,226 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Regression tests for ``vllm_mlx.utils.harmony_render``. + +Targets the harmony rendering path enabled when ``--tool-call-parser harmony`` +is active (see :issue:`568`). The tests check the wire-level output for the +shapes a multi-turn assistant-tool/tool-result conversation should produce, +matching what GPT-OSS was trained on. + +These tests run only when the optional ``openai-harmony`` package is +installed; skipped otherwise. +""" + +from __future__ import annotations + +import pytest + +from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages + +pytestmark = pytest.mark.skipif( + not HAS_HARMONY, + reason="openai-harmony not installed; harmony rendering is optional", +) + + +TOOLS = [ + { + "type": "function", + "function": { + "name": "read_file", + "description": "Read a file in the sandbox", + "parameters": { + "type": "object", + "properties": {"path": {"type": "string"}}, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "run_command", + "description": "Run a shell command in the sandbox", + "parameters": { + "type": "object", + "properties": {"cmd": {"type": "string"}}, + "required": ["cmd"], + }, + }, + }, +] + + +class TestHarmonyRender: + def test_single_turn_user_message(self): + prompt = render_messages( + [{"role": "user", "content": "Hello."}], + tools=None, + ) + assert "<|start|>system" in prompt + assert "<|start|>user<|message|>Hello.<|end|>" in prompt + assert prompt.endswith("<|start|>assistant") + + def test_developer_block_renders_tool_namespace(self): + prompt = render_messages( + [{"role": "user", "content": "List files."}], + tools=TOOLS, + ) + assert "<|start|>developer<|message|>" in prompt + assert "namespace functions" in prompt + # Both tool schemas should appear + assert "type read_file = (_:" in prompt + assert "type run_command = (_:" in prompt + + def test_section_order_system_developer_user(self): + """System must precede developer must precede the first user turn.""" + prompt = render_messages( + [ + {"role": "system", "content": "You are a coding assistant."}, + {"role": "user", "content": "Find the bug."}, + ], + tools=TOOLS, + ) + sys_pos = prompt.index("<|start|>system") + dev_pos = prompt.index("<|start|>developer") + user_pos = prompt.index("<|start|>user") + assert sys_pos < dev_pos < user_pos + + def test_assistant_tool_call_renders_commentary_channel(self): + """Assistant ``tool_calls`` must render in the commentary channel + addressed to ``functions.`` (the format the model was trained + on). The OLD text-flattening (``[Calling tool: …]``) must NOT appear. + """ + prompt = render_messages( + [ + {"role": "user", "content": "Find the bug in foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + {"role": "user", "content": "Continue."}, + ], + tools=TOOLS, + ) + assert "to=functions.run_command" in prompt + assert "<|channel|>commentary" in prompt + assert "<|call|>" in prompt + # Bracket-text fallback must NOT appear (that's the bug we're fixing). + assert "[Calling tool:" not in prompt + + def test_tool_result_renders_with_function_name(self): + """``role=tool`` messages need to come back addressed from + ``functions. to=assistant`` — the function name is resolved + by tracing the most recent assistant ``tool_call_id``. + """ + prompt = render_messages( + [ + {"role": "user", "content": "Look at foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ], + tools=TOOLS, + ) + assert "<|start|>functions.run_command to=assistant" in prompt + # ``[Tool Result …]`` text fallback must not appear. + assert "[Tool Result" not in prompt + + def test_assistant_thinking_renders_analysis_channel(self): + """Prior ``thinking`` text on an assistant turn that has tool_calls + must render in the analysis channel before the commentary call.""" + prompt = render_messages( + [ + {"role": "user", "content": "Find the bug."}, + { + "role": "assistant", + "content": "", + "thinking": "I should read the file first.", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "read_file", + "arguments": '{"path": "foo.py"}', + }, + } + ], + }, + ], + tools=TOOLS, + ) + assert "<|channel|>analysis" in prompt + assert "I should read the file first." in prompt + # Analysis channel must precede the commentary tool call. + analysis_pos = prompt.index("<|channel|>analysis") + call_pos = prompt.index("to=functions.read_file") + assert analysis_pos < call_pos + + def test_assistant_arguments_dict_is_serialized(self): + """Some callers pass ``arguments`` as a dict already — it should be + JSON-serialized into the commentary channel payload.""" + prompt = render_messages( + [ + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "read_file", + "arguments": {"path": "foo.py"}, + }, + } + ], + }, + ], + tools=TOOLS, + ) + assert '{"path": "foo.py"}' in prompt + + def test_final_assistant_message_renders_final_channel(self): + """A previous assistant turn with content and no tool_calls should + appear in the final channel.""" + prompt = render_messages( + [ + {"role": "user", "content": "Hi."}, + {"role": "assistant", "content": "Hello back!"}, + {"role": "user", "content": "And again."}, + ], + ) + assert "<|channel|>final<|message|>Hello back!" in prompt + + def test_generation_prompt_is_appended(self): + """The prompt must end with the bare ``<|start|>assistant`` marker so + the model knows it's its turn to generate.""" + prompt = render_messages([{"role": "user", "content": "Hi."}]) + assert prompt.rstrip().endswith("<|start|>assistant") diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index 0fb28f55e..f8bc3c32b 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -930,14 +930,33 @@ def run_stream(): template_kwargs["tools"] = template_tools safe_messages = normalize_messages_for_chat_template(messages) - try: - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) - except TypeError: - # Some templates don't support all kwargs - for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]: - if key in template_kwargs: - del template_kwargs[key] - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) + if getattr(self, "use_harmony_rendering", False): + # GPT-OSS / harmony-format models: render via openai-harmony + # instead of the Jinja chat_template. Bypasses the + # ``extract_multimodal_content`` text-flattening upstream + # (which drops structural ``tool_calls`` for non-native + # parsers) and uses OpenAI's canonical renderer. See #568. + from ..utils.harmony_render import ( + render_messages as _harmony_render_messages, + ) + + _reasoning_effort = None + if chat_template_kwargs: + _reasoning_effort = chat_template_kwargs.get("reasoning_effort") + prompt = _harmony_render_messages( + safe_messages, + tools=template_tools, + reasoning_effort=_reasoning_effort, + ) + else: + try: + prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) + except TypeError: + # Some templates don't support all kwargs + for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]: + if key in template_kwargs: + del template_kwargs[key] + prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) else: prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) prompt += "\nassistant:" @@ -1030,6 +1049,15 @@ def run_stream(): # exposes any non-KVCache entries or the probe failed. if not self._supports_system_kv_cache: cache_blocking_controls.append("non_kv_cache_class") + # The system-prefix probe (re-renders the conversation with two different + # user contents and compares the rendered strings) goes through + # ``tokenizer.apply_chat_template``. When the harmony rendering path is + # active the actual prompt is built by ``openai-harmony`` instead, so the + # probe and the prompt would diverge and the cache would never hit. + # Falling back to the uncached path keeps correctness without splitting + # the probe across both renderers. + if getattr(self, "use_harmony_rendering", False): + cache_blocking_controls.append("harmony_rendering") if cache_blocking_controls: logger.info( diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index 1de223f05..dcd4ff9c2 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -973,6 +973,7 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext: if _model_manager is None: engine = get_engine() engine.preserve_native_tool_format = _detect_native_tool_support() + engine.use_harmony_rendering = _detect_harmony_rendering() return RequestModelContext( model_name=_model_name or request_model, engine=engine ) @@ -983,6 +984,8 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext: raise HTTPException(status_code=503, detail=str(exc)) from exc lease.engine.preserve_native_tool_format = _detect_native_tool_support() + + lease.engine.use_harmony_rendering = _detect_harmony_rendering() return RequestModelContext( model_name=request_model, engine=lease.engine, @@ -1213,6 +1216,7 @@ def _activate_engine(engine: BaseEngine | None) -> BaseEngine | None: _engine = engine if _engine is not None: _engine.preserve_native_tool_format = _detect_native_tool_support() + _engine.use_harmony_rendering = _detect_harmony_rendering() return _engine @@ -2758,6 +2762,42 @@ def _detect_native_tool_support() -> bool: return False +def _detect_harmony_rendering() -> bool: + """Detect whether the harmony rendering path should handle prompt building. + + Returns True when ALL of: + - ``--tool-call-parser`` is set to ``harmony`` or ``gpt-oss`` + - ``--enable-auto-tool-choice`` is on + - the optional ``openai-harmony`` Python package is importable + + The third condition keeps non-gpt-oss deployments free of an extra + runtime dependency: if the package isn't installed, the engine falls + back to the standard ``tokenizer.apply_chat_template`` path. The + HarmonyToolParser's existing text-flatten behavior also stays in force + in that fallback so the response side is unchanged. + """ + if not _enable_auto_tool_choice or not _tool_call_parser: + return False + try: + from .utils.harmony_render import ( + HAS_HARMONY, + is_harmony_parser_name, + ) + except ImportError: + return False + if not is_harmony_parser_name(_tool_call_parser): + return False + if not HAS_HARMONY: + logger.warning( + "tool-call-parser=%s requested but `openai-harmony` is not " + "installed; falling back to tokenizer.apply_chat_template. " + "`pip install openai-harmony` to enable harmony rendering.", + _tool_call_parser, + ) + return False + return True + + def _tool_choice_disabled(request: ChatCompletionRequest | None) -> bool: """Return True when tool_choice explicitly disables tool calling.""" if request is None: @@ -3116,6 +3156,7 @@ def load_model( # Set native tool format support on the engine (thread-safe via instance property) _engine.preserve_native_tool_format = _detect_native_tool_support() + _engine.use_harmony_rendering = _detect_harmony_rendering() if _engine.preserve_native_tool_format: logger.info(f"Native tool format enabled for parser: {_tool_call_parser}") diff --git a/vllm_mlx/utils/harmony_render.py b/vllm_mlx/utils/harmony_render.py new file mode 100644 index 000000000..31ab64aac --- /dev/null +++ b/vllm_mlx/utils/harmony_render.py @@ -0,0 +1,286 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Harmony-format prompt rendering for GPT-OSS via ``openai-harmony``. + +GPT-OSS models are trained with OpenAI's harmony wire format (channeled +``<|start|>assistant<|channel|>commentary ...<|call|>`` tool calls, +``<|start|>functions.X to=assistant<|channel|>commentary<|message|>...`` +tool results, etc.). Rendering harmony correctly from OpenAI-style chat +messages is delicate: prior assistant ``tool_calls`` must arrive at the +template as structural objects, not the bracket-text fallback that +``api.utils.extract_multimodal_content()`` produces for non-native parsers. + +This module bypasses the Jinja chat template entirely for harmony-active +engines: it converts the OpenAI-format ``messages`` (plus ``tools``) to an +``openai_harmony.Conversation`` and asks the library — the canonical +renderer maintained by OpenAI — to serialize it. That sidesteps both the +text-flattening upstream and any template-vs-training-format drift. + +The library is an optional dependency. ``HAS_HARMONY`` reflects import +success so the rest of the engine can fall back to ``apply_chat_template`` +when the package is absent. + +See https://github.com/waybarrios/vllm-mlx/issues/568 for the original +report and the patch shape Thump604 outlined. +""" + +from __future__ import annotations + +import json +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +try: + import openai_harmony as _oh + + HAS_HARMONY = True +except ImportError: + _oh = None + HAS_HARMONY = False + + +def is_harmony_parser_name(parser_name: str | None) -> bool: + """Return True when the active --tool-call-parser is a harmony alias. + + ``HarmonyToolParser`` registers under both ``"harmony"`` and ``"gpt-oss"``. + """ + return parser_name in {"harmony", "gpt-oss"} + + +def _build_tools(tools: list[dict] | None) -> list[Any] | None: + if not tools or _oh is None: + return None + tool_descs: list[Any] = [] + for t in tools: + fn = t.get("function") or t + name = fn.get("name") + if not name: + continue + tool_descs.append( + _oh.ToolDescription.new( + name=name, + description=fn.get("description") or "", + parameters=fn.get("parameters") or {}, + ) + ) + return tool_descs or None + + +def _content_to_text(content: Any) -> str: + """Flatten OpenAI content (str | list[dict]) to plain text.""" + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + parts.append(item.get("text", "")) + elif isinstance(item, str): + parts.append(item) + return "\n".join(parts) + return str(content) + + +def _convert_message(msg: dict) -> list[Any]: + """Convert one OpenAI-format message to one or more ``openai_harmony.Message``. + + A single assistant turn can carry multiple tool_calls; harmony represents + each as its own commentary-channel message addressed to ``functions.X``. + Prior reasoning lives in an analysis-channel message that precedes the + tool calls. + """ + if _oh is None: + return [] + role = msg.get("role", "user") + content_text = _content_to_text(msg.get("content")) + out: list[Any] = [] + + if role == "system": + out.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, content_text)) + elif role == "user": + out.append(_oh.Message.from_role_and_content(_oh.Role.USER, content_text)) + elif role == "tool": + # The tool name lives on a prior assistant tool_call; the caller is + # expected to thread it through ``tool_call_id``-to-name mapping + # before this conversion runs. The OpenAI schema doesn't carry the + # function name on tool messages directly, so we leave the name slot + # as the conventional ``functions.unknown`` if it isn't injected + # under the ``name`` key. + tool_name = msg.get("name") or "functions.unknown" + if not tool_name.startswith("functions."): + tool_name = f"functions.{tool_name}" + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.TOOL, name=tool_name), + content=[_oh.TextContent(text=content_text)], + channel="commentary", + recipient="assistant", + ) + ) + elif role == "assistant": + thinking = msg.get("thinking") or msg.get("reasoning_content") + tool_calls = msg.get("tool_calls") or [] + # Reasoning (analysis channel) — only renders when there are tool_calls + # to follow; the harmony chat template otherwise drops it for prior + # turns (matches gpt-oss training). + if thinking and tool_calls: + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=str(thinking))], + channel="analysis", + ) + ) + # User-visible final-channel content + if content_text and not tool_calls: + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=content_text)], + channel="final", + ) + ) + # Tool calls + for tc in tool_calls: + fn = tc.get("function") or tc + name = fn.get("name", "unknown") + args = fn.get("arguments") + if isinstance(args, dict): + args_text = json.dumps(args, ensure_ascii=False) + elif args is None: + args_text = "{}" + else: + args_text = str(args) + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=args_text)], + channel="commentary", + recipient=f"functions.{name}", + content_type="json", + ) + ) + elif role == "developer": + out.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, content_text)) + # Any other role is silently dropped (matches existing chat-template behavior) + return out + + +def _resolve_tool_names(messages: list[dict]) -> list[dict]: + """Stamp ``name=functions.X`` on each ``role=tool`` message by tracing back + the most recent assistant ``tool_call_id`` -> function name.""" + by_call_id: dict[str, str] = {} + out: list[dict] = [] + for m in messages: + if not isinstance(m, dict): + out.append(m) + continue + if m.get("role") == "assistant": + for tc in m.get("tool_calls") or []: + if not isinstance(tc, dict): + continue + tc_id = tc.get("id") + fn = tc.get("function") or {} + name = fn.get("name") + if tc_id and name: + by_call_id[tc_id] = name + out.append(m) + continue + if m.get("role") == "tool": + new_m = dict(m) + if "name" not in new_m and (tc_id := new_m.get("tool_call_id")): + name = by_call_id.get(tc_id) + if name: + new_m["name"] = name + out.append(new_m) + continue + out.append(m) + return out + + +def render_messages( + messages: list[dict], + tools: list[dict] | None = None, + reasoning_effort: str | None = None, +) -> str: + """Render OpenAI-format messages as a harmony-format prompt string. + + Raises ``RuntimeError`` if ``openai-harmony`` is not importable; callers + should pre-check with :data:`HAS_HARMONY` and fall back to + ``tokenizer.apply_chat_template`` when False. + + Args: + messages: OpenAI chat-completions messages. + tools: OpenAI-format tools list (each item ``{"type":"function","function":{...}}``). + reasoning_effort: ``"low"``, ``"medium"``, or ``"high"``. Defaults to medium. + + Returns: + Decoded harmony prompt with the trailing ``<|start|>assistant`` + marker ready for the model to begin generation. + """ + if not HAS_HARMONY: + raise RuntimeError( + "openai-harmony is not installed. `pip install openai-harmony` or " + "fall back to tokenizer.apply_chat_template." + ) + + resolved = _resolve_tool_names(messages) + + # Pull system + developer messages to the head; gpt-oss expects + # ``<|start|>system|...|<|end|><|start|>developer|...|<|end|>`` before + # any user/assistant turn. + system_msgs: list[dict] = [] + developer_msgs: list[dict] = [] + other_msgs: list[dict] = [] + for m in resolved: + if not isinstance(m, dict): + other_msgs.append(m) + continue + role = m.get("role") + if role == "system": + system_msgs.append(m) + elif role == "developer": + developer_msgs.append(m) + else: + other_msgs.append(m) + + h_messages: list[Any] = [] + tool_descs = _build_tools(tools) + + # 1. System block (inject default if caller didn't provide one). + if system_msgs: + for m in system_msgs: + h_messages.extend(_convert_message(m)) + else: + sys_content = _oh.SystemContent.new() + if reasoning_effort: + try: + level = getattr(_oh.ReasoningEffort, reasoning_effort.upper()) + sys_content = sys_content.with_reasoning_effort(level) + except Exception: # noqa: BLE001 + pass + h_messages.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content)) + + # 2. Developer block (tool schema). If caller passed an explicit + # developer message, render that; otherwise synthesize from tools. + if developer_msgs: + for m in developer_msgs: + h_messages.extend(_convert_message(m)) + elif tool_descs: + dev_content = _oh.DeveloperContent.new().with_function_tools(tool_descs) + h_messages.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content)) + + # 3. Everything else, in original order. + for m in other_msgs: + if isinstance(m, dict): + h_messages.extend(_convert_message(m)) + + conv = _oh.Conversation.from_messages(h_messages) + enc = _oh.load_harmony_encoding(_oh.HarmonyEncodingName.HARMONY_GPT_OSS) + token_ids = enc.render_conversation_for_completion( + conv, next_turn_role=_oh.Role.ASSISTANT + ) + return enc.decode(token_ids) From f827eee8c36a6479e824ecdd670fbbadb649e0f8 Mon Sep 17 00:00:00 2001 From: Carlos Bribiescas Date: Tue, 26 May 2026 21:43:51 +0800 Subject: [PATCH 2/3] address #581 review: preserve native on prep + add CI coverage + black MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @Thump604's review: 1. The harmony renderer was wired too late: `_prepare_chat_messages()` ran `extract_multimodal_content(..., preserve_native_format=False)` first (because `HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT` is still False), flattening prior assistant `tool_calls` to `[Calling tool: ...]` bracket text + `role=tool` to text-stuffed `role=user` BEFORE `render_messages()` could see them. Fix: when `engine.use_harmony_rendering` is True, the prep path forces `preserve_native = True` locally so the structural shape survives into the harmony renderer. `HarmonyToolParser` keeps its public `SUPPORTS_NATIVE_TOOL_FORMAT=False` setting — the override is harmony-scoped and lives in server prep. 2. The new renderer is now exercised in CI: - `tests/test_harmony_render.py` added to the explicit pytest list under the no-MLX `test-matrix` job (covers Python 3.10/3.11/3.12/3.13). - `openai-harmony` added to that job's pip install line so the skipif gate doesn't silently skip. - Three new regression tests assert the end-to-end server prep path keeps tool_calls structured AND that the rendered prompt never contains `[Calling tool:` or `[Tool Result`. They fail loudly if the prep-path coupling ever regresses. 3. Will update the PR description on push: "Closes #568" → "Refs #568" until the maintainer merges and confirms the served path works in their environment. 4. Black would have reformatted `vllm_mlx/engine/simple.py` and `vllm_mlx/utils/harmony_render.py`; ran with project default style. All 127 parser + native-format + harmony tests pass. --- tests/test_harmony_render.py | 151 ++++++++++++++++++++++++++++++- vllm_mlx/engine/simple.py | 14 ++- vllm_mlx/server.py | 10 ++ vllm_mlx/utils/harmony_render.py | 8 +- 4 files changed, 177 insertions(+), 6 deletions(-) diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py index 2b6c747ff..08b8021ef 100644 --- a/tests/test_harmony_render.py +++ b/tests/test_harmony_render.py @@ -1,4 +1,3 @@ -# SPDX-License-Identifier: Apache-2.0 """Regression tests for ``vllm_mlx.utils.harmony_render``. Targets the harmony rendering path enabled when ``--tool-call-parser harmony`` @@ -12,6 +11,8 @@ from __future__ import annotations +import json + import pytest from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages @@ -224,3 +225,151 @@ def test_generation_prompt_is_appended(self): the model knows it's its turn to generate.""" prompt = render_messages([{"role": "user", "content": "Hi."}]) assert prompt.rstrip().endswith("<|start|>assistant") + + +class TestServerPathPreservesNativeForHarmony: + """End-to-end regression: messages with ``tool_calls`` must survive the + server's prep step without being flattened to bracket text, so the + harmony renderer downstream sees structured calls. + + Without the ``use_harmony_rendering=True`` plumbing in + ``_prepare_chat_messages``, ``extract_multimodal_content`` runs with + ``preserve_native_format=False`` (because ``HarmonyToolParser`` keeps + ``SUPPORTS_NATIVE_TOOL_FORMAT=False`` by default) and converts assistant + ``tool_calls`` into ``[Calling tool: …]`` strings + tool messages into + ``[Tool Result …]`` strings before the LLM path's render call. The + rendered prompt then leaks that bracket text through into the + final-channel slot, defeating the entire harmony rendering effort. + + These assertions fail loudly if that regression ever returns. + """ + + @pytest.fixture + def harmony_engine(self): + class _Engine: + is_mllm = False + preserve_native_tool_format = False # harmony parser keeps this False + use_harmony_rendering = True # set by _detect_harmony_rendering() + + return _Engine() + + def test_prep_path_preserves_tool_calls_for_harmony(self, harmony_engine): + from vllm_mlx.server import _prepare_chat_messages + + request_messages = [ + {"role": "user", "content": "Find the bug in foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ] + messages, _images, _videos, _audios, _has_media = _prepare_chat_messages( + harmony_engine, request_messages + ) + + # The bracket-text fallback must not have run. + for m in messages: + content = m.get("content") + if isinstance(content, str): + assert "[Calling tool:" not in content + assert "[Tool Result" not in content + + # Assistant tool_calls survived structurally. + assistant = next(m for m in messages if m.get("role") == "assistant") + assert assistant.get("tool_calls"), "tool_calls were dropped on prep" + assert assistant["tool_calls"][0]["function"]["name"] == "run_command" + + # Tool message survived as role=tool (not flattened to role=user). + tool_msg = next(m for m in messages if m.get("role") == "tool") + assert tool_msg.get("content") == "def foo():\n return None" + + def test_rendered_prompt_after_prep_has_no_bracket_text(self, harmony_engine): + """The whole point: end-to-end, the harmony renderer must produce + commentary-channel calls — not the bracket-text leftovers from + ``extract_multimodal_content``'s legacy flatten path.""" + from vllm_mlx.server import _prepare_chat_messages + + request_messages = [ + {"role": "user", "content": "Look at foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ] + prepped, *_ = _prepare_chat_messages(harmony_engine, request_messages) + prompt = render_messages(prepped, tools=TOOLS) + + assert "[Calling tool:" not in prompt + assert "[Tool Result" not in prompt + # Structural harmony shape did make it through. + assert "<|channel|>commentary" in prompt + assert "to=functions.run_command" in prompt + assert "<|start|>functions.run_command to=assistant" in prompt + + def test_non_harmony_engine_falls_through_unchanged(self): + """When ``use_harmony_rendering`` is False (default for all + non-harmony parsers), the prep path must behave exactly as before + this patch — i.e., honor ``preserve_native_tool_format`` and + nothing else. Guards against accidentally flipping native + preservation for unrelated parsers.""" + from vllm_mlx.server import _prepare_chat_messages + + class _NoHarmonyEngine: + is_mllm = False + preserve_native_tool_format = False + # use_harmony_rendering deliberately absent — exercises the + # default-False branch via getattr(). + + engine = _NoHarmonyEngine() + request_messages = [ + {"role": "user", "content": "Hi"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "fn", "arguments": "{}"}, + } + ], + }, + ] + messages, *_ = _prepare_chat_messages(engine, request_messages) + # Without harmony rendering AND without preserve_native_tool_format, + # the legacy bracket-text flatten still runs — that's the existing + # behaviour this PR doesn't change. + assistant = next(m for m in messages if m.get("role") == "assistant") + assert "[Calling tool: fn" in (assistant.get("content") or "") diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index f8bc3c32b..83dd6cd30 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -950,13 +950,21 @@ def run_stream(): ) else: try: - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) + prompt = tokenizer.apply_chat_template( + safe_messages, **template_kwargs + ) except TypeError: # Some templates don't support all kwargs - for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]: + for key in [ + "tools", + "enable_thinking", + *chat_template_kwargs.keys(), + ]: if key in template_kwargs: del template_kwargs[key] - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) + prompt = tokenizer.apply_chat_template( + safe_messages, **template_kwargs + ) else: prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) prompt += "\nassistant:" diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index dcd4ff9c2..318e76e17 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -313,6 +313,16 @@ def _prepare_chat_messages( is_mllm = bool(getattr(engine, "is_mllm", False)) preserve_native = bool(getattr(engine, "preserve_native_tool_format", False)) + # Harmony rendering needs the structural ``tool_calls`` / ``role=tool`` + # shape to survive ``extract_multimodal_content`` — otherwise prior + # assistant tool calls reach ``render_messages()`` as ``[Calling tool: …]`` + # bracket text and the harmony renderer can't reconstruct the + # commentary channel. The flag is set by ``_detect_harmony_rendering()`` + # only when the harmony parser is active AND ``openai-harmony`` is + # importable, so non-harmony parsers and the no-extras install path see + # no change. + if bool(getattr(engine, "use_harmony_rendering", False)): + preserve_native = True if is_mllm: # For MLLM models, keep original messages with embedded images diff --git a/vllm_mlx/utils/harmony_render.py b/vllm_mlx/utils/harmony_render.py index 31ab64aac..d1a489ff9 100644 --- a/vllm_mlx/utils/harmony_render.py +++ b/vllm_mlx/utils/harmony_render.py @@ -262,7 +262,9 @@ def render_messages( sys_content = sys_content.with_reasoning_effort(level) except Exception: # noqa: BLE001 pass - h_messages.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content)) + h_messages.append( + _oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content) + ) # 2. Developer block (tool schema). If caller passed an explicit # developer message, render that; otherwise synthesize from tools. @@ -271,7 +273,9 @@ def render_messages( h_messages.extend(_convert_message(m)) elif tool_descs: dev_content = _oh.DeveloperContent.new().with_function_tools(tool_descs) - h_messages.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content)) + h_messages.append( + _oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content) + ) # 3. Everything else, in original order. for m in other_msgs: From 6376e47fc787f29b5bbe18c35bab94462fcc371f Mon Sep 17 00:00:00 2001 From: Carlos Bribiescas Date: Tue, 26 May 2026 21:48:49 +0800 Subject: [PATCH 3/3] lint(tests): drop unused json import from test_harmony_render --- tests/test_harmony_render.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py index 08b8021ef..8896f1bd5 100644 --- a/tests/test_harmony_render.py +++ b/tests/test_harmony_render.py @@ -11,8 +11,6 @@ from __future__ import annotations -import json - import pytest from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages