diff --git a/pyproject.toml b/pyproject.toml index 3bfe353b1..05efe7b62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,13 @@ vision = [ "torch>=2.8.0", "torchvision>=0.23.0", ] +# Harmony prompt rendering for GPT-OSS models (see #568). With this installed +# and ``--tool-call-parser harmony``/``gpt-oss`` set, the engine routes prompt +# building through OpenAI's canonical renderer instead of the Jinja chat +# template (bypasses the bracket-text fallback that loses ``tool_calls``). +harmony = [ + "openai-harmony>=0.0.8", +] # Audio dependencies for TTS/STT (mlx-audio) audio = [ "mlx-audio>=0.2.9", diff --git a/tests/test_harmony_render.py b/tests/test_harmony_render.py new file mode 100644 index 000000000..8896f1bd5 --- /dev/null +++ b/tests/test_harmony_render.py @@ -0,0 +1,373 @@ +"""Regression tests for ``vllm_mlx.utils.harmony_render``. + +Targets the harmony rendering path enabled when ``--tool-call-parser harmony`` +is active (see :issue:`568`). The tests check the wire-level output for the +shapes a multi-turn assistant-tool/tool-result conversation should produce, +matching what GPT-OSS was trained on. + +These tests run only when the optional ``openai-harmony`` package is +installed; skipped otherwise. 
+""" + +from __future__ import annotations + +import pytest + +from vllm_mlx.utils.harmony_render import HAS_HARMONY, render_messages + +pytestmark = pytest.mark.skipif( + not HAS_HARMONY, + reason="openai-harmony not installed; harmony rendering is optional", +) + + +TOOLS = [ + { + "type": "function", + "function": { + "name": "read_file", + "description": "Read a file in the sandbox", + "parameters": { + "type": "object", + "properties": {"path": {"type": "string"}}, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "run_command", + "description": "Run a shell command in the sandbox", + "parameters": { + "type": "object", + "properties": {"cmd": {"type": "string"}}, + "required": ["cmd"], + }, + }, + }, +] + + +class TestHarmonyRender: + def test_single_turn_user_message(self): + prompt = render_messages( + [{"role": "user", "content": "Hello."}], + tools=None, + ) + assert "<|start|>system" in prompt + assert "<|start|>user<|message|>Hello.<|end|>" in prompt + assert prompt.endswith("<|start|>assistant") + + def test_developer_block_renders_tool_namespace(self): + prompt = render_messages( + [{"role": "user", "content": "List files."}], + tools=TOOLS, + ) + assert "<|start|>developer<|message|>" in prompt + assert "namespace functions" in prompt + # Both tool schemas should appear + assert "type read_file = (_:" in prompt + assert "type run_command = (_:" in prompt + + def test_section_order_system_developer_user(self): + """System must precede developer must precede the first user turn.""" + prompt = render_messages( + [ + {"role": "system", "content": "You are a coding assistant."}, + {"role": "user", "content": "Find the bug."}, + ], + tools=TOOLS, + ) + sys_pos = prompt.index("<|start|>system") + dev_pos = prompt.index("<|start|>developer") + user_pos = prompt.index("<|start|>user") + assert sys_pos < dev_pos < user_pos + + def test_assistant_tool_call_renders_commentary_channel(self): + """Assistant ``tool_calls`` 
must render in the commentary channel + addressed to ``functions.`` (the format the model was trained + on). The OLD text-flattening (``[Calling tool: …]``) must NOT appear. + """ + prompt = render_messages( + [ + {"role": "user", "content": "Find the bug in foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + {"role": "user", "content": "Continue."}, + ], + tools=TOOLS, + ) + assert "to=functions.run_command" in prompt + assert "<|channel|>commentary" in prompt + assert "<|call|>" in prompt + # Bracket-text fallback must NOT appear (that's the bug we're fixing). + assert "[Calling tool:" not in prompt + + def test_tool_result_renders_with_function_name(self): + """``role=tool`` messages need to come back addressed from + ``functions. to=assistant`` — the function name is resolved + by tracing the most recent assistant ``tool_call_id``. + """ + prompt = render_messages( + [ + {"role": "user", "content": "Look at foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ], + tools=TOOLS, + ) + assert "<|start|>functions.run_command to=assistant" in prompt + # ``[Tool Result …]`` text fallback must not appear. 
+ assert "[Tool Result" not in prompt + + def test_assistant_thinking_renders_analysis_channel(self): + """Prior ``thinking`` text on an assistant turn that has tool_calls + must render in the analysis channel before the commentary call.""" + prompt = render_messages( + [ + {"role": "user", "content": "Find the bug."}, + { + "role": "assistant", + "content": "", + "thinking": "I should read the file first.", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "read_file", + "arguments": '{"path": "foo.py"}', + }, + } + ], + }, + ], + tools=TOOLS, + ) + assert "<|channel|>analysis" in prompt + assert "I should read the file first." in prompt + # Analysis channel must precede the commentary tool call. + analysis_pos = prompt.index("<|channel|>analysis") + call_pos = prompt.index("to=functions.read_file") + assert analysis_pos < call_pos + + def test_assistant_arguments_dict_is_serialized(self): + """Some callers pass ``arguments`` as a dict already — it should be + JSON-serialized into the commentary channel payload.""" + prompt = render_messages( + [ + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "read_file", + "arguments": {"path": "foo.py"}, + }, + } + ], + }, + ], + tools=TOOLS, + ) + assert '{"path": "foo.py"}' in prompt + + def test_final_assistant_message_renders_final_channel(self): + """A previous assistant turn with content and no tool_calls should + appear in the final channel.""" + prompt = render_messages( + [ + {"role": "user", "content": "Hi."}, + {"role": "assistant", "content": "Hello back!"}, + {"role": "user", "content": "And again."}, + ], + ) + assert "<|channel|>final<|message|>Hello back!" 
in prompt + + def test_generation_prompt_is_appended(self): + """The prompt must end with the bare ``<|start|>assistant`` marker so + the model knows it's its turn to generate.""" + prompt = render_messages([{"role": "user", "content": "Hi."}]) + assert prompt.rstrip().endswith("<|start|>assistant") + + +class TestServerPathPreservesNativeForHarmony: + """End-to-end regression: messages with ``tool_calls`` must survive the + server's prep step without being flattened to bracket text, so the + harmony renderer downstream sees structured calls. + + Without the ``use_harmony_rendering=True`` plumbing in + ``_prepare_chat_messages``, ``extract_multimodal_content`` runs with + ``preserve_native_format=False`` (because ``HarmonyToolParser`` keeps + ``SUPPORTS_NATIVE_TOOL_FORMAT=False`` by default) and converts assistant + ``tool_calls`` into ``[Calling tool: …]`` strings + tool messages into + ``[Tool Result …]`` strings before the LLM path's render call. The + rendered prompt then leaks that bracket text through into the + final-channel slot, defeating the entire harmony rendering effort. + + These assertions fail loudly if that regression ever returns. 
+ """ + + @pytest.fixture + def harmony_engine(self): + class _Engine: + is_mllm = False + preserve_native_tool_format = False # harmony parser keeps this False + use_harmony_rendering = True # set by _detect_harmony_rendering() + + return _Engine() + + def test_prep_path_preserves_tool_calls_for_harmony(self, harmony_engine): + from vllm_mlx.server import _prepare_chat_messages + + request_messages = [ + {"role": "user", "content": "Find the bug in foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ] + messages, _images, _videos, _audios, _has_media = _prepare_chat_messages( + harmony_engine, request_messages + ) + + # The bracket-text fallback must not have run. + for m in messages: + content = m.get("content") + if isinstance(content, str): + assert "[Calling tool:" not in content + assert "[Tool Result" not in content + + # Assistant tool_calls survived structurally. + assistant = next(m for m in messages if m.get("role") == "assistant") + assert assistant.get("tool_calls"), "tool_calls were dropped on prep" + assert assistant["tool_calls"][0]["function"]["name"] == "run_command" + + # Tool message survived as role=tool (not flattened to role=user). 
+ tool_msg = next(m for m in messages if m.get("role") == "tool") + assert tool_msg.get("content") == "def foo():\n return None" + + def test_rendered_prompt_after_prep_has_no_bracket_text(self, harmony_engine): + """The whole point: end-to-end, the harmony renderer must produce + commentary-channel calls — not the bracket-text leftovers from + ``extract_multimodal_content``'s legacy flatten path.""" + from vllm_mlx.server import _prepare_chat_messages + + request_messages = [ + {"role": "user", "content": "Look at foo.py."}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "run_command", + "arguments": '{"cmd": "cat foo.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": "def foo():\n return None", + }, + {"role": "user", "content": "Continue."}, + ] + prepped, *_ = _prepare_chat_messages(harmony_engine, request_messages) + prompt = render_messages(prepped, tools=TOOLS) + + assert "[Calling tool:" not in prompt + assert "[Tool Result" not in prompt + # Structural harmony shape did make it through. + assert "<|channel|>commentary" in prompt + assert "to=functions.run_command" in prompt + assert "<|start|>functions.run_command to=assistant" in prompt + + def test_non_harmony_engine_falls_through_unchanged(self): + """When ``use_harmony_rendering`` is False (default for all + non-harmony parsers), the prep path must behave exactly as before + this patch — i.e., honor ``preserve_native_tool_format`` and + nothing else. Guards against accidentally flipping native + preservation for unrelated parsers.""" + from vllm_mlx.server import _prepare_chat_messages + + class _NoHarmonyEngine: + is_mllm = False + preserve_native_tool_format = False + # use_harmony_rendering deliberately absent — exercises the + # default-False branch via getattr(). 
+ + engine = _NoHarmonyEngine() + request_messages = [ + {"role": "user", "content": "Hi"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "fn", "arguments": "{}"}, + } + ], + }, + ] + messages, *_ = _prepare_chat_messages(engine, request_messages) + # Without harmony rendering AND without preserve_native_tool_format, + # the legacy bracket-text flatten still runs — that's the existing + # behaviour this PR doesn't change. + assistant = next(m for m in messages if m.get("role") == "assistant") + assert "[Calling tool: fn" in (assistant.get("content") or "") diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index 0fb28f55e..83dd6cd30 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -930,14 +930,41 @@ def run_stream(): template_kwargs["tools"] = template_tools safe_messages = normalize_messages_for_chat_template(messages) - try: - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) - except TypeError: - # Some templates don't support all kwargs - for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]: - if key in template_kwargs: - del template_kwargs[key] - prompt = tokenizer.apply_chat_template(safe_messages, **template_kwargs) + if getattr(self, "use_harmony_rendering", False): + # GPT-OSS / harmony-format models: render via openai-harmony + # instead of the Jinja chat_template. Bypasses the + # ``extract_multimodal_content`` text-flattening upstream + # (which drops structural ``tool_calls`` for non-native + # parsers) and uses OpenAI's canonical renderer. See #568. 
+ from ..utils.harmony_render import ( + render_messages as _harmony_render_messages, + ) + + _reasoning_effort = None + if chat_template_kwargs: + _reasoning_effort = chat_template_kwargs.get("reasoning_effort") + prompt = _harmony_render_messages( + safe_messages, + tools=template_tools, + reasoning_effort=_reasoning_effort, + ) + else: + try: + prompt = tokenizer.apply_chat_template( + safe_messages, **template_kwargs + ) + except TypeError: + # Some templates don't support all kwargs + for key in [ + "tools", + "enable_thinking", + *chat_template_kwargs.keys(), + ]: + if key in template_kwargs: + del template_kwargs[key] + prompt = tokenizer.apply_chat_template( + safe_messages, **template_kwargs + ) else: prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) prompt += "\nassistant:" @@ -1030,6 +1057,15 @@ def run_stream(): # exposes any non-KVCache entries or the probe failed. if not self._supports_system_kv_cache: cache_blocking_controls.append("non_kv_cache_class") + # The system-prefix probe (re-renders the conversation with two different + # user contents and compares the rendered strings) goes through + # ``tokenizer.apply_chat_template``. When the harmony rendering path is + # active the actual prompt is built by ``openai-harmony`` instead, so the + # probe and the prompt would diverge and the cache would never hit. + # Falling back to the uncached path keeps correctness without splitting + # the probe across both renderers. 
+ if getattr(self, "use_harmony_rendering", False): + cache_blocking_controls.append("harmony_rendering") if cache_blocking_controls: logger.info( diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index 1de223f05..318e76e17 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -313,6 +313,16 @@ def _prepare_chat_messages( is_mllm = bool(getattr(engine, "is_mllm", False)) preserve_native = bool(getattr(engine, "preserve_native_tool_format", False)) + # Harmony rendering needs the structural ``tool_calls`` / ``role=tool`` + # shape to survive ``extract_multimodal_content`` — otherwise prior + # assistant tool calls reach ``render_messages()`` as ``[Calling tool: …]`` + # bracket text and the harmony renderer can't reconstruct the + # commentary channel. The flag is set by ``_detect_harmony_rendering()`` + # only when the harmony parser is active AND ``openai-harmony`` is + # importable, so non-harmony parsers and the no-extras install path see + # no change. + if bool(getattr(engine, "use_harmony_rendering", False)): + preserve_native = True if is_mllm: # For MLLM models, keep original messages with embedded images @@ -973,6 +983,7 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext: if _model_manager is None: engine = get_engine() engine.preserve_native_tool_format = _detect_native_tool_support() + engine.use_harmony_rendering = _detect_harmony_rendering() return RequestModelContext( model_name=_model_name or request_model, engine=engine ) @@ -983,6 +994,8 @@ async def _acquire_request_model(request_model: str) -> RequestModelContext: raise HTTPException(status_code=503, detail=str(exc)) from exc lease.engine.preserve_native_tool_format = _detect_native_tool_support() + + lease.engine.use_harmony_rendering = _detect_harmony_rendering() return RequestModelContext( model_name=request_model, engine=lease.engine, @@ -1213,6 +1226,7 @@ def _activate_engine(engine: BaseEngine | None) -> BaseEngine | None: _engine = engine if _engine is 
not None: _engine.preserve_native_tool_format = _detect_native_tool_support() + _engine.use_harmony_rendering = _detect_harmony_rendering() return _engine @@ -2758,6 +2772,42 @@ def _detect_native_tool_support() -> bool: return False +def _detect_harmony_rendering() -> bool: + """Detect whether the harmony rendering path should handle prompt building. + + Returns True when ALL of: + - ``--tool-call-parser`` is set to ``harmony`` or ``gpt-oss`` + - ``--enable-auto-tool-choice`` is on + - the optional ``openai-harmony`` Python package is importable + + The third condition keeps non-gpt-oss deployments free of an extra + runtime dependency: if the package isn't installed, the engine falls + back to the standard ``tokenizer.apply_chat_template`` path. The + HarmonyToolParser's existing text-flatten behavior also stays in force + in that fallback so the response side is unchanged. + """ + if not _enable_auto_tool_choice or not _tool_call_parser: + return False + try: + from .utils.harmony_render import ( + HAS_HARMONY, + is_harmony_parser_name, + ) + except ImportError: + return False + if not is_harmony_parser_name(_tool_call_parser): + return False + if not HAS_HARMONY: + logger.warning( + "tool-call-parser=%s requested but `openai-harmony` is not " + "installed; falling back to tokenizer.apply_chat_template. 
" + "`pip install openai-harmony` to enable harmony rendering.", + _tool_call_parser, + ) + return False + return True + + def _tool_choice_disabled(request: ChatCompletionRequest | None) -> bool: """Return True when tool_choice explicitly disables tool calling.""" if request is None: @@ -3116,6 +3166,7 @@ def load_model( # Set native tool format support on the engine (thread-safe via instance property) _engine.preserve_native_tool_format = _detect_native_tool_support() + _engine.use_harmony_rendering = _detect_harmony_rendering() if _engine.preserve_native_tool_format: logger.info(f"Native tool format enabled for parser: {_tool_call_parser}") diff --git a/vllm_mlx/utils/harmony_render.py b/vllm_mlx/utils/harmony_render.py new file mode 100644 index 000000000..d1a489ff9 --- /dev/null +++ b/vllm_mlx/utils/harmony_render.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Harmony-format prompt rendering for GPT-OSS via ``openai-harmony``. + +GPT-OSS models are trained with OpenAI's harmony wire format (channeled +``<|start|>assistant<|channel|>commentary ...<|call|>`` tool calls, +``<|start|>functions.X to=assistant<|channel|>commentary<|message|>...`` +tool results, etc.). Rendering harmony correctly from OpenAI-style chat +messages is delicate: prior assistant ``tool_calls`` must arrive at the +template as structural objects, not the bracket-text fallback that +``api.utils.extract_multimodal_content()`` produces for non-native parsers. + +This module bypasses the Jinja chat template entirely for harmony-active +engines: it converts the OpenAI-format ``messages`` (plus ``tools``) to an +``openai_harmony.Conversation`` and asks the library — the canonical +renderer maintained by OpenAI — to serialize it. That sidesteps both the +text-flattening upstream and any template-vs-training-format drift. + +The library is an optional dependency. 
``HAS_HARMONY`` reflects import +success so the rest of the engine can fall back to ``apply_chat_template`` +when the package is absent. + +See https://github.com/waybarrios/vllm-mlx/issues/568 for the original +report and the patch shape Thump604 outlined. +""" + +from __future__ import annotations + +import json +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +try: + import openai_harmony as _oh + + HAS_HARMONY = True +except ImportError: + _oh = None + HAS_HARMONY = False + + +def is_harmony_parser_name(parser_name: str | None) -> bool: + """Return True when the active --tool-call-parser is a harmony alias. + + ``HarmonyToolParser`` registers under both ``"harmony"`` and ``"gpt-oss"``. + """ + return parser_name in {"harmony", "gpt-oss"} + + +def _build_tools(tools: list[dict] | None) -> list[Any] | None: + if not tools or _oh is None: + return None + tool_descs: list[Any] = [] + for t in tools: + fn = t.get("function") or t + name = fn.get("name") + if not name: + continue + tool_descs.append( + _oh.ToolDescription.new( + name=name, + description=fn.get("description") or "", + parameters=fn.get("parameters") or {}, + ) + ) + return tool_descs or None + + +def _content_to_text(content: Any) -> str: + """Flatten OpenAI content (str | list[dict]) to plain text.""" + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + parts.append(item.get("text", "")) + elif isinstance(item, str): + parts.append(item) + return "\n".join(parts) + return str(content) + + +def _convert_message(msg: dict) -> list[Any]: + """Convert one OpenAI-format message to one or more ``openai_harmony.Message``. + + A single assistant turn can carry multiple tool_calls; harmony represents + each as its own commentary-channel message addressed to ``functions.X``. 
+ Prior reasoning lives in an analysis-channel message that precedes the + tool calls. + """ + if _oh is None: + return [] + role = msg.get("role", "user") + content_text = _content_to_text(msg.get("content")) + out: list[Any] = [] + + if role == "system": + out.append(_oh.Message.from_role_and_content(_oh.Role.SYSTEM, content_text)) + elif role == "user": + out.append(_oh.Message.from_role_and_content(_oh.Role.USER, content_text)) + elif role == "tool": + # The tool name lives on a prior assistant tool_call; the caller is + # expected to thread it through ``tool_call_id``-to-name mapping + # before this conversion runs. The OpenAI schema doesn't carry the + # function name on tool messages directly, so we leave the name slot + # as the conventional ``functions.unknown`` if it isn't injected + # under the ``name`` key. + tool_name = msg.get("name") or "functions.unknown" + if not tool_name.startswith("functions."): + tool_name = f"functions.{tool_name}" + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.TOOL, name=tool_name), + content=[_oh.TextContent(text=content_text)], + channel="commentary", + recipient="assistant", + ) + ) + elif role == "assistant": + thinking = msg.get("thinking") or msg.get("reasoning_content") + tool_calls = msg.get("tool_calls") or [] + # Reasoning (analysis channel) — only renders when there are tool_calls + # to follow; the harmony chat template otherwise drops it for prior + # turns (matches gpt-oss training). 
+ if thinking and tool_calls: + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=str(thinking))], + channel="analysis", + ) + ) + # User-visible final-channel content + if content_text and not tool_calls: + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=content_text)], + channel="final", + ) + ) + # Tool calls + for tc in tool_calls: + fn = tc.get("function") or tc + name = fn.get("name", "unknown") + args = fn.get("arguments") + if isinstance(args, dict): + args_text = json.dumps(args, ensure_ascii=False) + elif args is None: + args_text = "{}" + else: + args_text = str(args) + out.append( + _oh.Message( + author=_oh.Author.new(_oh.Role.ASSISTANT, name=None), + content=[_oh.TextContent(text=args_text)], + channel="commentary", + recipient=f"functions.{name}", + content_type="json", + ) + ) + elif role == "developer": + out.append(_oh.Message.from_role_and_content(_oh.Role.DEVELOPER, content_text)) + # Any other role is silently dropped (matches existing chat-template behavior) + return out + + +def _resolve_tool_names(messages: list[dict]) -> list[dict]: + """Stamp ``name=functions.X`` on each ``role=tool`` message by tracing back + the most recent assistant ``tool_call_id`` -> function name.""" + by_call_id: dict[str, str] = {} + out: list[dict] = [] + for m in messages: + if not isinstance(m, dict): + out.append(m) + continue + if m.get("role") == "assistant": + for tc in m.get("tool_calls") or []: + if not isinstance(tc, dict): + continue + tc_id = tc.get("id") + fn = tc.get("function") or {} + name = fn.get("name") + if tc_id and name: + by_call_id[tc_id] = name + out.append(m) + continue + if m.get("role") == "tool": + new_m = dict(m) + if "name" not in new_m and (tc_id := new_m.get("tool_call_id")): + name = by_call_id.get(tc_id) + if name: + new_m["name"] = name + out.append(new_m) + continue + out.append(m) + return out + + 
# Process-wide cache for the harmony encoding. Populated lazily on the first
# render so the (immutable) encoding is not re-loaded on every request.
_HARMONY_ENCODING: Any = None


def _get_harmony_encoding() -> Any:
    """Return the GPT-OSS harmony encoding, loading it on first use only."""
    global _HARMONY_ENCODING
    if _HARMONY_ENCODING is None:
        _HARMONY_ENCODING = _oh.load_harmony_encoding(
            _oh.HarmonyEncodingName.HARMONY_GPT_OSS
        )
    return _HARMONY_ENCODING


def _default_system_message(reasoning_effort: str | None) -> Any:
    """Build the library-default system message, applying ``reasoning_effort``.

    An unrecognised effort value is logged and ignored (the library default
    stays in force) instead of being swallowed silently.
    """
    sys_content = _oh.SystemContent.new()
    if reasoning_effort:
        level = None
        if isinstance(reasoning_effort, str):
            level = getattr(_oh.ReasoningEffort, reasoning_effort.upper(), None)
        if level is None:
            logger.warning(
                "Ignoring unsupported reasoning_effort=%r for harmony rendering",
                reasoning_effort,
            )
        else:
            sys_content = sys_content.with_reasoning_effort(level)
    return _oh.Message.from_role_and_content(_oh.Role.SYSTEM, sys_content)


def render_messages(
    messages: list[dict],
    tools: list[dict] | None = None,
    reasoning_effort: str | None = None,
) -> str:
    """Render OpenAI-format messages as a harmony-format prompt string.

    The conversation is emitted as: system block, developer block, then every
    remaining message in its original order, followed by the generation
    prompt for the assistant role.

    Args:
        messages: OpenAI chat-completions messages. Entries that are not
            dicts are ignored.
        tools: OpenAI-format tools list (each item
            ``{"type":"function","function":{...}}``).
        reasoning_effort: Name of an ``openai_harmony.ReasoningEffort`` member
            (case-insensitive). Only applied when the caller did NOT supply a
            ``system`` message, because a caller-provided system message is
            rendered verbatim. Unknown values are logged and ignored.

    Returns:
        The decoded harmony prompt produced by
        ``render_conversation_for_completion`` with the assistant as the next
        turn role.

    Raises:
        RuntimeError: If ``openai-harmony`` is not importable; callers should
            pre-check :data:`HAS_HARMONY` and fall back to
            ``tokenizer.apply_chat_template`` when it is False.
    """
    if not HAS_HARMONY:
        raise RuntimeError(
            "openai-harmony is not installed. `pip install openai-harmony` or "
            "fall back to tokenizer.apply_chat_template."
        )

    resolved = _resolve_tool_names(messages)

    # Pull system + developer messages to the head; gpt-oss expects
    # ``<|start|>system|...|<|end|><|start|>developer|...|<|end|>`` before
    # any user/assistant turn.
    system_msgs: list[dict] = []
    developer_msgs: list[dict] = []
    other_msgs: list[dict] = []
    for m in resolved:
        if not isinstance(m, dict):
            # Non-dict entries cannot be converted; drop them here once.
            continue
        role = m.get("role")
        if role == "system":
            system_msgs.append(m)
        elif role == "developer":
            developer_msgs.append(m)
        else:
            other_msgs.append(m)

    h_messages: list[Any] = []
    tool_descs = _build_tools(tools)

    # 1. System block (inject the library default if caller didn't provide one).
    if system_msgs:
        for m in system_msgs:
            h_messages.extend(_convert_message(m))
    else:
        h_messages.append(_default_system_message(reasoning_effort))

    # 2. Developer block. When the caller passes BOTH an explicit developer
    # message and tools, merge them into a single developer message so the
    # tool schema is not lost (previously the tools were silently dropped
    # whenever a developer message was present).
    if developer_msgs and tool_descs:
        instructions = "\n\n".join(
            text
            for m in developer_msgs
            if (text := _content_to_text(m.get("content")))
        )
        dev_content = _oh.DeveloperContent.new()
        if instructions:
            dev_content = dev_content.with_instructions(instructions)
        dev_content = dev_content.with_function_tools(tool_descs)
        h_messages.append(
            _oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content)
        )
    elif developer_msgs:
        for m in developer_msgs:
            h_messages.extend(_convert_message(m))
    elif tool_descs:
        dev_content = _oh.DeveloperContent.new().with_function_tools(tool_descs)
        h_messages.append(
            _oh.Message.from_role_and_content(_oh.Role.DEVELOPER, dev_content)
        )

    # 3. Everything else, in original order.
    for m in other_msgs:
        h_messages.extend(_convert_message(m))

    conv = _oh.Conversation.from_messages(h_messages)
    enc = _get_harmony_encoding()
    token_ids = enc.render_conversation_for_completion(
        conv, next_turn_role=_oh.Role.ASSISTANT
    )
    return enc.decode(token_ids)