From a6c6ee43abb60b182a94a48340d563f60ab78c90 Mon Sep 17 00:00:00 2001
From: Carlos Bribiescas <cbribiescas@gmail.com>
Date: Mon, 25 May 2026 16:41:45 +0800
Subject: [PATCH 1/3] fix(gpt-oss): route harmony prompts through
 openai-harmony
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses #568. GPT-OSS prompts go through ``tokenizer.apply_chat_template``
today, but ``api.utils.extract_multimodal_content()`` text-flattens prior
assistant ``tool_calls`` to ``[Calling tool: X(args)]`` bracket strings when
the active tool parser doesn't preserve native format. That breaks multi-turn
agentic workloads on GPT-OSS — the model sees a malformed conversation and
falls back to one-shot final-channel responses (avg_turns ~1, no tool use).

This patch adds a separate prompt-rendering path that, only when active,
routes through OpenAI's canonical ``openai-harmony`` renderer instead of the
Jinja template. The renderer accepts structured ``Conversation`` objects
(messages + tool definitions) and emits the wire format the GPT-OSS weights
were trained on, sidestepping the text-flattening upstream entirely.

Per @Thump604's suggested patch shape on #568:

1. **Regression test** — ``tests/test_harmony_render.py`` (9 tests)
   covers the multi-turn assistant-tool/tool-result rendering, section
   order, channel placement, generation-prompt suffix, and explicitly
   asserts the bracket-text fallback NEVER appears.
2. **Scoped narrowly** — engine reads ``self.use_harmony_rendering``
   (default ``False``); set by the server only when
   ``--tool-call-parser harmony``/``gpt-oss`` is active AND ``openai-harmony``
   is importable. The flag controls one ``if`` at the prompt-render site;
   non-harmony parsers and templates are untouched.
3. **Routed through openai-harmony** — new ``vllm_mlx/utils/harmony_render.py``
   converts OpenAI-format messages + tools to a ``Conversation`` and calls
   ``HarmonyEncoding.render_conversation_for_completion``. Tool-call-id ->
   function-name resolution is done locally so the OpenAI ``role=tool``
   shape (which lacks the function name field) flows through correctly.
4. **Non-harmony behavior unchanged** — when the harmony path is inactive
   (parser is anything other than harmony/gpt-oss, OR the optional
   ``openai-harmony`` package is not installed) the existing
   ``apply_chat_template`` codepath runs verbatim. The system-prefix KV
   cache probe (which assumes the Jinja path) is gated off for harmony
   requests so the probe/actual-prompt strings can't desynchronize.

``openai-harmony`` is added as an optional extra (``pip install vllm-mlx[harmony]``)
so the default install footprint is unchanged.

All 124 existing parser tests still pass alongside the 9 new harmony tests.
---
 pyproject.toml                   |   7 +
 tests/test_harmony_render.py     | 226 ++++++++++++++++++++++++
 vllm_mlx/engine/simple.py        |  44 ++++-
 vllm_mlx/server.py               |  41 +++++
 vllm_mlx/utils/harmony_render.py | 286 +++++++++++++++++++++++++++++++
 5 files changed, 596 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_harmony_render.py
 create mode 100644 vllm_mlx/utils/harmony_render.py

diff --git a/pyproject.toml b/pyproject.toml
index 3bfe353b1..05efe7b62 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,6 +81,13 @@ vision = [
     "torch>=2.8.0",
     "torchvision>=0.23.0",
 ]
+# Harmony prompt rendering for GPT-OSS models (see #568). With this installed
+# and ``--tool-call-parser harmony``/``gpt-oss`` set, the engine routes prompt
+# building through OpenAI's canonical renderer instead of the Jinja chat
+# template (bypasses the bracket-text fallback that loses ``tool_calls``).
+harmony = [
+    "openai-harmony>=0.0.8",
+]
 # Audio dependencies for TTS/STT (mlx-audio)
 audio = [
     "mlx-audio>=0.2.9",
diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py
new file mode 100644
index 000000000..2b6c747ff
--- /dev/null
+++ b/tests/test_harmony_render.py
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Regression tests for ``vllm_mlx.utils.harmony_render``.
+
+Targets the harmony rendering path enabled when ``--tool-call-parser harmony``
+is active (see :issue:`568`). The tests check the wire-level output for the
+shapes a multi-turn assistant-tool/tool-result conversation should produce,
+matching what GPT-OSS was trained on.
+
+These tests run only when the optional ``openai-harmony`` package is
+installed; skipped otherwise.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages
+
+pytestmark = pytest.mark.skipif(
+    not HAS_HARMONY,
+    reason="openai-harmony not installed; harmony rendering is optional",
+)
+
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "read_file",
+            "description": "Read a file in the sandbox",
+            "parameters": {
+                "type": "object",
+                "properties": {"path": {"type": "string"}},
+                "required": ["path"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "run_command",
+            "description": "Run a shell command in the sandbox",
+            "parameters": {
+                "type": "object",
+                "properties": {"cmd": {"type": "string"}},
+                "required": ["cmd"],
+            },
+        },
+    },
+]
+
+
+class TestHarmonyRender:
+    def test_single_turn_user_message(self):
+        prompt = render_messages(
+            [{"role": "user", "content": "Hello."}],
+            tools=None,
+        )
+        assert "<|start|>system" in prompt
+        assert "<|start|>user<|message|>Hello.<|end|>" in prompt
+        assert prompt.endswith("<|start|>assistant")
+
+    def test_developer_block_renders_tool_namespace(self):
+        prompt = render_messages(
+            [{"role": "user", "content": "List files."}],
+            tools=TOOLS,
+        )
+        assert "<|start|>developer<|message|>" in prompt
+        assert "namespace functions" in prompt
+        # Both tool schemas should appear
+        assert "type read_file = (_:" in prompt
+        assert "type run_command = (_:" in prompt
+
+    def test_section_order_system_developer_user(self):
+        """System must precede developer must precede the first user turn."""
+        prompt = render_messages(
+            [
+                {"role": "system", "content": "You are a coding assistant."},
+                {"role": "user", "content": "Find the bug."},
+            ],
+            tools=TOOLS,
+        )
+        sys_pos = prompt.index("<|start|>system")
+        dev_pos = prompt.index("<|start|>developer")
+        user_pos = prompt.index("<|start|>user")
+        assert sys_pos < dev_pos < user_pos
+
+    def test_assistant_tool_call_renders_commentary_channel(self):
+        """Assistant ``tool_calls`` must render in the commentary channel
+        addressed to ``functions.<name>`` (the format the model was trained
+        on). The OLD text-flattening (``[Calling tool: …]``) must NOT appear.
+        """
+        prompt = render_messages(
+            [
+                {"role": "user", "content": "Find the bug in foo.py."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {
+                                "name": "run_command",
+                                "arguments": '{"cmd": "cat foo.py"}',
+                            },
+                        }
+                    ],
+                },
+                {"role": "user", "content": "Continue."},
+            ],
+            tools=TOOLS,
+        )
+        assert "to=functions.run_command" in prompt
+        assert "<|channel|>commentary" in prompt
+        assert "<|call|>" in prompt
+        # Bracket-text fallback must NOT appear (that's the bug we're fixing).
+        assert "[Calling tool:" not in prompt
+
+    def test_tool_result_renders_with_function_name(self):
+        """``role=tool`` messages need to come back addressed from
+        ``functions.<name> to=assistant`` — the function name is resolved
+        by tracing the most recent assistant ``tool_call_id``.
+        """
+        prompt = render_messages(
+            [
+                {"role": "user", "content": "Look at foo.py."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {
+                                "name": "run_command",
+                                "arguments": '{"cmd": "cat foo.py"}',
+                            },
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "tool_call_id": "c1",
+                    "content": "def foo():\n    return None",
+                },
+                {"role": "user", "content": "Continue."},
+            ],
+            tools=TOOLS,
+        )
+        assert "<|start|>functions.run_command to=assistant" in prompt
+        # ``[Tool Result …]`` text fallback must not appear.
+        assert "[Tool Result" not in prompt
+
+    def test_assistant_thinking_renders_analysis_channel(self):
+        """Prior ``thinking`` text on an assistant turn that has tool_calls
+        must render in the analysis channel before the commentary call."""
+        prompt = render_messages(
+            [
+                {"role": "user", "content": "Find the bug."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "thinking": "I should read the file first.",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {
+                                "name": "read_file",
+                                "arguments": '{"path": "foo.py"}',
+                            },
+                        }
+                    ],
+                },
+            ],
+            tools=TOOLS,
+        )
+        assert "<|channel|>analysis" in prompt
+        assert "I should read the file first." in prompt
+        # Analysis channel must precede the commentary tool call.
+        analysis_pos = prompt.index("<|channel|>analysis")
+        call_pos = prompt.index("to=functions.read_file")
+        assert analysis_pos < call_pos
+
+    def test_assistant_arguments_dict_is_serialized(self):
+        """Some callers pass ``arguments`` as a dict already — it should be
+        JSON-serialized into the commentary channel payload."""
+        prompt = render_messages(
+            [
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {
+                                "name": "read_file",
+                                "arguments": {"path": "foo.py"},
+                            },
+                        }
+                    ],
+                },
+            ],
+            tools=TOOLS,
+        )
+        assert '{"path": "foo.py"}' in prompt
+
+    def test_final_assistant_message_renders_final_channel(self):
+        """A previous assistant turn with content and no tool_calls should
+        appear in the final channel."""
+        prompt = render_messages(
+            [
+                {"role": "user", "content": "Hi."},
+                {"role": "assistant", "content": "Hello back!"},
+                {"role": "user", "content": "And again."},
+            ],
+        )
+        assert "<|channel|>final<|message|>Hello back!" in prompt
+
+    def test_generation_prompt_is_appended(self):
+        """The prompt must end with the bare ``<|start|>assistant`` marker so
+        the model knows it's its turn to generate."""
+        prompt = render_messages([{"role": "user", "content": "Hi."}])
+        assert prompt.rstrip().endswith("<|start|>assistant")
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index 0fb28f55e..f8bc3c32b 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -930,14 +930,33 @@ def run_stream():
                 template_kwargs["tools"] = template_tools
             safe_messages = normalize_messages_for_chat_template(messages)
 
-            try:
-                prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
-            except TypeError:
-                # Some templates don't support all kwargs
-                for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]:
-                    if key in template_kwargs:
-                        del template_kwargs[key]
-                prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
+            if getattr(self, "use_harmony_rendering", False):
+                # GPT-OSS / harmony-format models: render via openai-harmony
+                # instead of the Jinja chat_template. Bypasses the
+                # ``extract_multimodal_content`` text-flattening upstream
+                # (which drops structural ``tool_calls`` for non-native
+                # parsers) and uses OpenAI's canonical renderer. See #568.
+                from ..utils.harmony_render import (
+                    render_messages as _harmony_render_messages,
+                )
+
+                _reasoning_effort = None
+                if chat_template_kwargs:
+                    _reasoning_effort = chat_template_kwargs.get("reasoning_effort")
+                prompt = _harmony_render_messages(
+                    safe_messages,
+                    tools=template_tools,
+                    reasoning_effort=_reasoning_effort,
+                )
+            else:
+                try:
+                    prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
+                except TypeError:
+                    # Some templates don't support all kwargs
+                    for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]:
+                        if key in template_kwargs:
+                            del template_kwargs[key]
+                    prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
         else:
             prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
             prompt += "\nassistant:"
@@ -1030,6 +1049,15 @@ def run_stream():
         # exposes any non-KVCache entries or the probe failed.
         if not self._supports_system_kv_cache:
             cache_blocking_controls.append("non_kv_cache_class")
+        # The system-prefix probe (re-renders the conversation with two different
+        # user contents and compares the rendered strings) goes through
+        # ``tokenizer.apply_chat_template``. When the harmony rendering path is
+        # active the actual prompt is built by ``openai-harmony`` instead, so the
+        # probe and the prompt would diverge and the cache would never hit.
+        # Falling back to the uncached path keeps correctness without splitting
+        # the probe across both renderers.
+        if getattr(self, "use_harmony_rendering", False):
+            cache_blocking_controls.append("harmony_rendering")
 
         if cache_blocking_controls:
             logger.info(
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
index 1de223f05..dcd4ff9c2 100644
--- a/vllm_mlx/server.py
+++ b/vllm_mlx/server.py
@@ -973,6 +973,7 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext:
     if _model_manager is None:
         engine = get_engine()
         engine.preserve_native_tool_format = _detect_native_tool_support()
+        engine.use_harmony_rendering = _detect_harmony_rendering()
         return RequestModelContext(
             model_name=_model_name or request_model, engine=engine
         )
@@ -983,6 +984,8 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext:
         raise HTTPException(status_code=503, detail=str(exc)) from exc
 
     lease.engine.preserve_native_tool_format = _detect_native_tool_support()
+
+    lease.engine.use_harmony_rendering = _detect_harmony_rendering()
     return RequestModelContext(
         model_name=request_model,
         engine=lease.engine,
@@ -1213,6 +1216,7 @@ def _activate_engine(engine: BaseEngine | None) -> BaseEngine | None:
     _engine = engine
     if _engine is not None:
         _engine.preserve_native_tool_format = _detect_native_tool_support()
+        _engine.use_harmony_rendering = _detect_harmony_rendering()
     return _engine
 
 
@@ -2758,6 +2762,42 @@ def _detect_native_tool_support() -> bool:
         return False
 
 
+def _detect_harmony_rendering() -> bool:
+    """Detect whether the harmony rendering path should handle prompt building.
+
+    Returns True when ALL of:
+    - ``--tool-call-parser`` is set to ``harmony`` or ``gpt-oss``
+    - ``--enable-auto-tool-choice`` is on
+    - the optional ``openai-harmony`` Python package is importable
+
+    The third condition keeps non-gpt-oss deployments free of an extra
+    runtime dependency: if the package isn't installed, the engine falls
+    back to the standard ``tokenizer.apply_chat_template`` path. The
+    HarmonyToolParser's existing text-flatten behavior also stays in force
+    in that fallback so the response side is unchanged.
+    """
+    if not _enable_auto_tool_choice or not _tool_call_parser:
+        return False
+    try:
+        from .utils.harmony_render import (
+            HAS_HARMONY,
+            is_harmony_parser_name,
+        )
+    except ImportError:
+        return False
+    if not is_harmony_parser_name(_tool_call_parser):
+        return False
+    if not HAS_HARMONY:
+        logger.warning(
+            "tool-call-parser=%s requested but `openai-harmony` is not "
+            "installed; falling back to tokenizer.apply_chat_template. "
+            "`pip install openai-harmony` to enable harmony rendering.",
+            _tool_call_parser,
+        )
+        return False
+    return True
+
+
 def _tool_choice_disabled(request: ChatCompletionRequest | None) -> bool:
     """Return True when tool_choice explicitly disables tool calling."""
     if request is None:
@@ -3116,6 +3156,7 @@ def load_model(
 
     # Set native tool format support on the engine (thread-safe via instance property)
     _engine.preserve_native_tool_format = _detect_native_tool_support()
+    _engine.use_harmony_rendering = _detect_harmony_rendering()
     if _engine.preserve_native_tool_format:
         logger.info(f"Native tool format enabled for parser: {_tool_call_parser}")
 
diff --git a/vllm_mlx/utils/harmony_render.py b/vllm_mlx/utils/harmony_render.py
new file mode 100644
index 000000000..31ab64aac
--- /dev/null
+++ b/vllm_mlx/utils/harmony_render.py
@@ -0,0 +1,286 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Harmony-format prompt rendering for GPT-OSS via ``openai-harmony``.
+
+GPT-OSS models are trained with OpenAI's harmony wire format (channeled
+``<|start|>assistant<|channel|>commentary ...<|call|>`` tool calls,
+``<|start|>functions.X to=assistant<|channel|>commentary<|message|>...``
+tool results, etc.). Rendering harmony correctly from OpenAI-style chat
+messages is delicate: prior assistant ``tool_calls`` must arrive at the
+template as structural objects, not the bracket-text fallback that
+``api.utils.extract_multimodal_content()`` produces for non-native parsers.
+
+This module bypasses the Jinja chat template entirely for harmony-active
+engines: it converts the OpenAI-format ``messages`` (plus ``tools``) to an
+``openai_harmony.Conversation`` and asks the library — the canonical
+renderer maintained by OpenAI — to serialize it. That sidesteps both the
+text-flattening upstream and any template-vs-training-format drift.
+
+The library is an optional dependency. ``HAS_HARMONY`` reflects import
+success so the rest of the engine can fall back to ``apply_chat_template``
+when the package is absent.
+
+See https://github.com/waybarrios/vllm-mlx/issues/568 for the original
+report and the patch shape Thump604 outlined.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+try:
+    import openai_harmony as _oh
+
+    HAS_HARMONY = True
+except ImportError:
+    _oh = None
+    HAS_HARMONY = False
+
+
+def is_harmony_parser_name(parser_name: str | None) -> bool:
+    """Return True when the active --tool-call-parser is a harmony alias.
+
+    ``HarmonyToolParser`` registers under both ``"harmony"`` and ``"gpt-oss"``.
+    """
+    return parser_name in {"harmony", "gpt-oss"}
+
+
+def _build_tools(tools: list[dict] | None) -> list[Any] | None:
+    if not tools or _oh is None:
+        return None
+    tool_descs: list[Any] = []
+    for t in tools:
+        fn = t.get("function") or t
+        name = fn.get("name")
+        if not name:
+            continue
+        tool_descs.append(
+            _oh.ToolDescription.new(
+                name=name,
+                description=fn.get("description") or "",
+                parameters=fn.get("parameters") or {},
+            )
+        )
+    return tool_descs or None
+
+
+def _content_to_text(content: Any) -> str:
+    """Flatten OpenAI content (str | list[dict]) to plain text."""
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts: list[str] = []
+        for item in content:
+            if isinstance(item, dict) and item.get("type") == "text":
+                parts.append(item.get("text", ""))
+            elif isinstance(item, str):
+                parts.append(item)
+        return "\n".join(parts)
+    return str(content)
+
+
+def _convert_message(msg: dict) -> list[Any]:
+    """Convert one OpenAI-format message to one or more ``openai_harmony.Message``.
+
+    A single assistant turn can carry multiple tool_calls; harmony represents
+    each as its own commentary-channel message addressed to ``functions.X``.
+    Prior reasoning lives in an analysis-channel message that precedes the
+    tool calls.
+    """
+    if _oh is None:
+        return []
+    role = msg.get("role", "user")
+    content_text = _content_to_text(msg.get("content"))
+    out: list[Any] = []
+
+    if role == "system":
+        out.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, content_text))
+    elif role == "user":
+        out.append(_oh.Message.from_role_and_content(_oh.Role.USER, content_text))
+    elif role == "tool":
+        # The tool name lives on a prior assistant tool_call; the caller is
+        # expected to thread it through ``tool_call_id``-to-name mapping
+        # before this conversion runs. The OpenAI schema doesn't carry the
+        # function name on tool messages directly, so we leave the name slot
+        # as the conventional ``functions.unknown`` if it isn't injected
+        # under the ``name`` key.
+        tool_name = msg.get("name") or "functions.unknown"
+        if not tool_name.startswith("functions."):
+            tool_name = f"functions.{tool_name}"
+        out.append(
+            _oh.Message(
+                author=_oh.Author.new(_oh.Role.TOOL, name=tool_name),
+                content=[_oh.TextContent(text=content_text)],
+                channel="commentary",
+                recipient="assistant",
+            )
+        )
+    elif role == "assistant":
+        thinking = msg.get("thinking") or msg.get("reasoning_content")
+        tool_calls = msg.get("tool_calls") or []
+        # Reasoning (analysis channel) — only renders when there are tool_calls
+        # to follow; the harmony chat template otherwise drops it for prior
+        # turns (matches gpt-oss training).
+        if thinking and tool_calls:
+            out.append(
+                _oh.Message(
+                    author=_oh.Author.new(_oh.Role.ASSISTANT, name=None),
+                    content=[_oh.TextContent(text=str(thinking))],
+                    channel="analysis",
+                )
+            )
+        # User-visible final-channel content
+        if content_text and not tool_calls:
+            out.append(
+                _oh.Message(
+                    author=_oh.Author.new(_oh.Role.ASSISTANT, name=None),
+                    content=[_oh.TextContent(text=content_text)],
+                    channel="final",
+                )
+            )
+        # Tool calls
+        for tc in tool_calls:
+            fn = tc.get("function") or tc
+            name = fn.get("name", "unknown")
+            args = fn.get("arguments")
+            if isinstance(args, dict):
+                args_text = json.dumps(args, ensure_ascii=False)
+            elif args is None:
+                args_text = "{}"
+            else:
+                args_text = str(args)
+            out.append(
+                _oh.Message(
+                    author=_oh.Author.new(_oh.Role.ASSISTANT, name=None),
+                    content=[_oh.TextContent(text=args_text)],
+                    channel="commentary",
+                    recipient=f"functions.{name}",
+                    content_type="json",
+                )
+            )
+    elif role == "developer":
+        out.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, content_text))
+    # Any other role is silently dropped (matches existing chat-template behavior)
+    return out
+
+
+def _resolve_tool_names(messages: list[dict]) -> list[dict]:
+    """Stamp ``name=functions.X`` on each ``role=tool`` message by tracing back
+    the most recent assistant ``tool_call_id`` -> function name."""
+    by_call_id: dict[str, str] = {}
+    out: list[dict] = []
+    for m in messages:
+        if not isinstance(m, dict):
+            out.append(m)
+            continue
+        if m.get("role") == "assistant":
+            for tc in m.get("tool_calls") or []:
+                if not isinstance(tc, dict):
+                    continue
+                tc_id = tc.get("id")
+                fn = tc.get("function") or {}
+                name = fn.get("name")
+                if tc_id and name:
+                    by_call_id[tc_id] = name
+            out.append(m)
+            continue
+        if m.get("role") == "tool":
+            new_m = dict(m)
+            if "name" not in new_m and (tc_id := new_m.get("tool_call_id")):
+                name = by_call_id.get(tc_id)
+                if name:
+                    new_m["name"] = name
+            out.append(new_m)
+            continue
+        out.append(m)
+    return out
+
+
+def render_messages(
+    messages: list[dict],
+    tools: list[dict] | None = None,
+    reasoning_effort: str | None = None,
+) -> str:
+    """Render OpenAI-format messages as a harmony-format prompt string.
+
+    Raises ``RuntimeError`` if ``openai-harmony`` is not importable; callers
+    should pre-check with :data:`HAS_HARMONY` and fall back to
+    ``tokenizer.apply_chat_template`` when False.
+
+    Args:
+        messages: OpenAI chat-completions messages.
+        tools: OpenAI-format tools list (each item ``{"type":"function","function":{...}}``).
+        reasoning_effort: ``"low"``, ``"medium"``, or ``"high"``. Defaults to medium.
+
+    Returns:
+        Decoded harmony prompt with the trailing ``<|start|>assistant``
+        marker ready for the model to begin generation.
+    """
+    if not HAS_HARMONY:
+        raise RuntimeError(
+            "openai-harmony is not installed. `pip install openai-harmony` or "
+            "fall back to tokenizer.apply_chat_template."
+        )
+
+    resolved = _resolve_tool_names(messages)
+
+    # Pull system + developer messages to the head; gpt-oss expects
+    # ``<|start|>system|...|<|end|><|start|>developer|...|<|end|>`` before
+    # any user/assistant turn.
+    system_msgs: list[dict] = []
+    developer_msgs: list[dict] = []
+    other_msgs: list[dict] = []
+    for m in resolved:
+        if not isinstance(m, dict):
+            other_msgs.append(m)
+            continue
+        role = m.get("role")
+        if role == "system":
+            system_msgs.append(m)
+        elif role == "developer":
+            developer_msgs.append(m)
+        else:
+            other_msgs.append(m)
+
+    h_messages: list[Any] = []
+    tool_descs = _build_tools(tools)
+
+    # 1. System block (inject default if caller didn't provide one).
+    if system_msgs:
+        for m in system_msgs:
+            h_messages.extend(_convert_message(m))
+    else:
+        sys_content = _oh.SystemContent.new()
+        if reasoning_effort:
+            try:
+                level = getattr(_oh.ReasoningEffort, reasoning_effort.upper())
+                sys_content = sys_content.with_reasoning_effort(level)
+            except Exception:  # noqa: BLE001
+                pass
+        h_messages.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content))
+
+    # 2. Developer block (tool schema). If caller passed an explicit
+    #    developer message, render that; otherwise synthesize from tools.
+    if developer_msgs:
+        for m in developer_msgs:
+            h_messages.extend(_convert_message(m))
+    elif tool_descs:
+        dev_content = _oh.DeveloperContent.new().with_function_tools(tool_descs)
+        h_messages.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content))
+
+    # 3. Everything else, in original order.
+    for m in other_msgs:
+        if isinstance(m, dict):
+            h_messages.extend(_convert_message(m))
+
+    conv = _oh.Conversation.from_messages(h_messages)
+    enc = _oh.load_harmony_encoding(_oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+    token_ids = enc.render_conversation_for_completion(
+        conv, next_turn_role=_oh.Role.ASSISTANT
+    )
+    return enc.decode(token_ids)

From f827eee8c36a6479e824ecdd670fbbadb649e0f8 Mon Sep 17 00:00:00 2001
From: Carlos Bribiescas <cbribiescas@gmail.com>
Date: Tue, 26 May 2026 21:43:51 +0800
Subject: [PATCH 2/3] address #581 review: preserve native on prep + add CI
 coverage + black
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per @Thump604's review:

1. The harmony renderer was wired too late: `_prepare_chat_messages()`
   ran `extract_multimodal_content(..., preserve_native_format=False)`
   first (because `HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT` is
   still False), flattening prior assistant `tool_calls` to
   `[Calling tool: ...]` bracket text + `role=tool` to text-stuffed
   `role=user` BEFORE `render_messages()` could see them. Fix: when
   `engine.use_harmony_rendering` is True, the prep path forces
   `preserve_native = True` locally so the structural shape survives
   into the harmony renderer. `HarmonyToolParser` keeps its public
   `SUPPORTS_NATIVE_TOOL_FORMAT=False` setting — the override is
   harmony-scoped and lives in server prep.

2. The new renderer is now exercised in CI:
   - `tests/test_harmony_render.py` added to the explicit pytest list
     under the no-MLX `test-matrix` job (covers Python 3.10/3.11/3.12/3.13).
   - `openai-harmony` added to that job's pip install line so the
     skipif gate doesn't silently skip.
   - Three new regression tests assert the end-to-end server prep path
     keeps tool_calls structured AND that the rendered prompt never
     contains `[Calling tool:` or `[Tool Result`. They fail loudly if
     the prep-path coupling ever regresses.

3. Will update the PR description on push: "Closes #568" → "Refs #568"
   until the maintainer merges and confirms the served path works in
   their environment.

4. Black would have reformatted `vllm_mlx/engine/simple.py` and
   `vllm_mlx/utils/harmony_render.py`; ran with project default style.

All 127 parser + native-format + harmony tests pass.
---
 tests/test_harmony_render.py     | 151 ++++++++++++++++++++++++++++++-
 vllm_mlx/engine/simple.py        |  14 ++-
 vllm_mlx/server.py               |  10 ++
 vllm_mlx/utils/harmony_render.py |   8 +-
 4 files changed, 177 insertions(+), 6 deletions(-)

diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py
index 2b6c747ff..08b8021ef 100644
--- a/tests/test_harmony_render.py
+++ b/tests/test_harmony_render.py
@@ -1,4 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
 """Regression tests for ``vllm_mlx.utils.harmony_render``.
 
 Targets the harmony rendering path enabled when ``--tool-call-parser harmony``
@@ -12,6 +11,8 @@
 
 from __future__ import annotations
 
+import json
+
 import pytest
 
 from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages
@@ -224,3 +225,151 @@ def test_generation_prompt_is_appended(self):
         the model knows it's its turn to generate."""
         prompt = render_messages([{"role": "user", "content": "Hi."}])
         assert prompt.rstrip().endswith("<|start|>assistant")
+
+
+class TestServerPathPreservesNativeForHarmony:
+    """End-to-end regression: messages with ``tool_calls`` must survive the
+    server's prep step without being flattened to bracket text, so the
+    harmony renderer downstream sees structured calls.
+
+    Without the ``use_harmony_rendering=True`` plumbing in
+    ``_prepare_chat_messages``, ``extract_multimodal_content`` runs with
+    ``preserve_native_format=False`` (because ``HarmonyToolParser`` keeps
+    ``SUPPORTS_NATIVE_TOOL_FORMAT=False`` by default) and converts assistant
+    ``tool_calls`` into ``[Calling tool: …]`` strings + tool messages into
+    ``[Tool Result …]`` strings before the LLM path's render call. The
+    rendered prompt then leaks that bracket text through into the
+    final-channel slot, defeating the entire harmony rendering effort.
+
+    These assertions fail loudly if that regression ever returns.
+    """
+
+    @pytest.fixture
+    def harmony_engine(self):
+        class _Engine:
+            is_mllm = False
+            preserve_native_tool_format = False  # harmony parser keeps this False
+            use_harmony_rendering = True  # set by _detect_harmony_rendering()
+
+        return _Engine()
+
+    def test_prep_path_preserves_tool_calls_for_harmony(self, harmony_engine):
+        from vllm_mlx.server import _prepare_chat_messages
+
+        request_messages = [
+            {"role": "user", "content": "Find the bug in foo.py."},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "c1",
+                        "type": "function",
+                        "function": {
+                            "name": "run_command",
+                            "arguments": '{"cmd": "cat foo.py"}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c1",
+                "content": "def foo():\n    return None",
+            },
+            {"role": "user", "content": "Continue."},
+        ]
+        messages, _images, _videos, _audios, _has_media = _prepare_chat_messages(
+            harmony_engine, request_messages
+        )
+
+        # The bracket-text fallback must not have run.
+        for m in messages:
+            content = m.get("content")
+            if isinstance(content, str):
+                assert "[Calling tool:" not in content
+                assert "[Tool Result" not in content
+
+        # Assistant tool_calls survived structurally.
+        assistant = next(m for m in messages if m.get("role") == "assistant")
+        assert assistant.get("tool_calls"), "tool_calls were dropped on prep"
+        assert assistant["tool_calls"][0]["function"]["name"] == "run_command"
+
+        # Tool message survived as role=tool (not flattened to role=user).
+        tool_msg = next(m for m in messages if m.get("role") == "tool")
+        assert tool_msg.get("content") == "def foo():\n    return None"
+
+    def test_rendered_prompt_after_prep_has_no_bracket_text(self, harmony_engine):
+        """The whole point: end-to-end, the harmony renderer must produce
+        commentary-channel calls — not the bracket-text leftovers from
+        ``extract_multimodal_content``'s legacy flatten path."""
+        from vllm_mlx.server import _prepare_chat_messages
+
+        request_messages = [
+            {"role": "user", "content": "Look at foo.py."},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "c1",
+                        "type": "function",
+                        "function": {
+                            "name": "run_command",
+                            "arguments": '{"cmd": "cat foo.py"}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c1",
+                "content": "def foo():\n    return None",
+            },
+            {"role": "user", "content": "Continue."},
+        ]
+        prepped, *_ = _prepare_chat_messages(harmony_engine, request_messages)
+        prompt = render_messages(prepped, tools=TOOLS)
+
+        assert "[Calling tool:" not in prompt
+        assert "[Tool Result" not in prompt
+        # Structural harmony shape did make it through.
+        assert "<|channel|>commentary" in prompt
+        assert "to=functions.run_command" in prompt
+        assert "<|start|>functions.run_command to=assistant" in prompt
+
+    def test_non_harmony_engine_falls_through_unchanged(self):
+        """When ``use_harmony_rendering`` is False (default for all
+        non-harmony parsers), the prep path must behave exactly as before
+        this patch — i.e., honor ``preserve_native_tool_format`` and
+        nothing else. Guards against accidentally flipping native
+        preservation for unrelated parsers."""
+        from vllm_mlx.server import _prepare_chat_messages
+
+        class _NoHarmonyEngine:
+            is_mllm = False
+            preserve_native_tool_format = False
+            # use_harmony_rendering deliberately absent — exercises the
+            # default-False branch via getattr().
+
+        engine = _NoHarmonyEngine()
+        request_messages = [
+            {"role": "user", "content": "Hi"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "c1",
+                        "type": "function",
+                        "function": {"name": "fn", "arguments": "{}"},
+                    }
+                ],
+            },
+        ]
+        messages, *_ = _prepare_chat_messages(engine, request_messages)
+        # Without harmony rendering AND without preserve_native_tool_format,
+        # the legacy bracket-text flatten still runs — that's the existing
+        # behaviour this PR doesn't change.
+        assistant = next(m for m in messages if m.get("role") == "assistant")
+        assert "[Calling tool: fn" in (assistant.get("content") or "")
diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py
index f8bc3c32b..83dd6cd30 100644
--- a/vllm_mlx/engine/simple.py
+++ b/vllm_mlx/engine/simple.py
@@ -950,13 +950,21 @@ def run_stream():
                 )
             else:
                 try:
-                    prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
+                    prompt = tokenizer.apply_chat_template(
+                        safe_messages, **template_kwargs
+                    )
                 except TypeError:
                     # Some templates don't support all kwargs
-                    for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]:
+                    for key in [
+                        "tools",
+                        "enable_thinking",
+                        *chat_template_kwargs.keys(),
+                    ]:
                         if key in template_kwargs:
                             del template_kwargs[key]
-                    prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs)
+                    prompt = tokenizer.apply_chat_template(
+                        safe_messages, **template_kwargs
+                    )
         else:
             prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
             prompt += "\nassistant:"
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
index dcd4ff9c2..318e76e17 100644
--- a/vllm_mlx/server.py
+++ b/vllm_mlx/server.py
@@ -313,6 +313,16 @@ def _prepare_chat_messages(
 
     is_mllm = bool(getattr(engine, "is_mllm", False))
     preserve_native = bool(getattr(engine, "preserve_native_tool_format", False))
+    # Harmony rendering needs the structural ``tool_calls`` / ``role=tool``
+    # shape to survive ``extract_multimodal_content`` — otherwise prior
+    # assistant tool calls reach ``render_messages()`` as ``[Calling tool: …]``
+    # bracket text and the harmony renderer can't reconstruct the
+    # commentary channel. The flag is set by ``_detect_harmony_rendering()``
+    # only when the harmony parser is active AND ``openai-harmony`` is
+    # importable, so non-harmony parsers and the no-extras install path see
+    # no change.
+    if bool(getattr(engine, "use_harmony_rendering", False)):
+        preserve_native = True
 
     if is_mllm:
         # For MLLM models, keep original messages with embedded images
diff --git a/vllm_mlx/utils/harmony_render.py b/vllm_mlx/utils/harmony_render.py
index 31ab64aac..d1a489ff9 100644
--- a/vllm_mlx/utils/harmony_render.py
+++ b/vllm_mlx/utils/harmony_render.py
@@ -262,7 +262,9 @@ def render_messages(
                 sys_content = sys_content.with_reasoning_effort(level)
             except Exception:  # noqa: BLE001
                 pass
-        h_messages.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content))
+        h_messages.append(
+            _oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content)
+        )
 
     # 2. Developer block (tool schema). If caller passed an explicit
     #    developer message, render that; otherwise synthesize from tools.
@@ -271,7 +273,9 @@ def render_messages(
             h_messages.extend(_convert_message(m))
     elif tool_descs:
         dev_content = _oh.DeveloperContent.new().with_function_tools(tool_descs)
-        h_messages.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content))
+        h_messages.append(
+            _oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content)
+        )
 
     # 3. Everything else, in original order.
     for m in other_msgs:

From 6376e47fc787f29b5bbe18c35bab94462fcc371f Mon Sep 17 00:00:00 2001
From: Carlos Bribiescas <cbribiescas@gmail.com>
Date: Tue, 26 May 2026 21:48:49 +0800
Subject: [PATCH 3/3] lint(tests): drop unused json import from
 test_harmony_render

---
 tests/test_harmony_render.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py
index 08b8021ef..8896f1bd5 100644
--- a/tests/test_harmony_render.py
+++ b/tests/test_harmony_render.py
@@ -11,8 +11,6 @@
 
 from __future__ import annotations
 
-import json
-
 import pytest
 
 from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages