From c16c8d80c35d16685298c6f46498f1a560381d41 Mon Sep 17 00:00:00 2001
From: Wallace Wadge <wallace.wadge@cepton.com>
Date: Thu, 21 May 2026 09:57:40 +0200
Subject: [PATCH] Scope quantization-config served-id inference to Qwen3.6-27B

The config.json quantization layout (Q4 weights with a Q8 head, flat Q8,
etc.) is shared across many MLX builds, so it cannot identify the model
family on its own. _public_model_id_from_metadata was using it as the
last-resort signal and unconditionally returning one of the
mtplx-qwen36-27b-* ids, so a third-party artifact like
Qwen3.6-35B-A3B-4bit-MTPLX-Optimized-Speed was being served as
mtplx-qwen36-27b-optimized-quality on /v1/models.

Gate that fallback on _public_model_id_from_name already identifying the
folder as a Qwen3.6-27B MTPLX variant. Explicit metadata fields
(public_model_id, served_model_id, model_id, precision_variant,
artifact_role, verified_on.model) still take precedence, so the existing
"runtime metadata before folder name" behavior is preserved. Unknown
folders now fall through to the basename-sanitized id, e.g.
qwen3.6-35b-a3b-4bit-mtplx-optimized-speed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mtplx/default_models.py      | 10 ++++++++++
 tests/test_default_models.py | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/mtplx/default_models.py b/mtplx/default_models.py
index a0d5e02..877a973 100644
--- a/mtplx/default_models.py
+++ b/mtplx/default_models.py
@@ -209,6 +209,16 @@ def _public_model_id_from_metadata(path: Path) -> str | None:
             if inferred:
                 return inferred
 
+    # The config.json quantization layout (Q4 weights with a Q8 head, flat Q8,
+    # etc.) is shared by many MLX builds, so it can only distinguish the
+    # Qwen3.6-27B MTPLX Speed/Quality split — not the model family. Skip this
+    # refinement for folders that don't already identify as a Qwen3.6-27B MTPLX
+    # artifact; otherwise a third-party model like
+    # `Qwen3.6-35B-A3B-4bit-MTPLX-Optimized-Speed` would silently be served as
+    # `mtplx-qwen36-27b-optimized-quality`.
+    if _public_model_id_from_name(str(path)) is None:
+        return None
+
     config = _read_json(path / "config.json")
     quantization = config.get("quantization") or config.get("quantization_config")
     if isinstance(quantization, dict):
diff --git a/tests/test_default_models.py b/tests/test_default_models.py
index 1b73191..33f5ca5 100644
--- a/tests/test_default_models.py
+++ b/tests/test_default_models.py
@@ -294,3 +294,28 @@ def test_public_model_id_for_ref_maps_unknown_local_name_to_sanitized_id():
         public_model_id_for_ref("/tmp/My Custom Local Model!")
         == "my-custom-local-model"
     )
+
+
+def test_public_model_id_for_ref_third_party_q4_q8_keeps_folder_name(tmp_path):
+    # Regression: a third-party Qwen3.6-35B-A3B folder sharing the Qwen3.6-27B
+    # Quality config layout (Q4 weights, Q8 head) must keep its sanitized
+    # folder name instead of `mtplx-qwen36-27b-optimized-quality`.
+    model = tmp_path / "Qwen3.6-35B-A3B-4bit-MTPLX-Optimized-Speed"
+    model.mkdir()
+    (model / "config.json").write_text(
+        json.dumps(
+            {
+                "quantization": {
+                    "bits": 4,
+                    "language_model.model.layers.0.mlp.down_proj": {"bits": 8},
+                    "language_model.model.layers.0.linear_attn.in_proj_qkv": {"bits": 8},
+                }
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    assert (
+        public_model_id_for_ref(model)
+        == "qwen3.6-35b-a3b-4bit-mtplx-optimized-speed"
+    )