From 243669ed7c8bb43d261f990fe39f6a38c0d08d0f Mon Sep 17 00:00:00 2001
From: kaiyuan Zhang <zky21@mails.tsinghua.edu.cn>
Date: Mon, 17 Mar 2025 02:59:54 +0000
Subject: [PATCH 1/5] FEAT: Support gemma-3

---
 doc/source/getting_started/installation.rst |   1 +
 xinference/model/llm/llm_family.json        | 316 ++++++++++++++++++
 .../model/llm/llm_family_modelscope.json    | 160 +++++++++
 xinference/model/llm/sglang/core.py         |   1 +
 4 files changed, 478 insertions(+)

diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index 679c8033ec..36ff45cfdc 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -61,6 +61,7 @@ Currently, supported models include:
 - ``QwQ-32B-Preview``, ``QwQ-32B``
 - ``marco-o1``
 - ``gemma-it``, ``gemma-2-it``
+- ``gemma-3-it``, ``gemma-3-27b-it``, ``gemma-3-12b-it``, ``gemma-3-4b-it``, ``gemma-3-1b-it``
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
 - ``minicpm3-4b``
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 95b8d6a1e5..5b13efc2b4 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -5786,6 +5786,322 @@
       "<start_of_turn>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "gemma-3-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-1b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-4b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-12b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-27b-it"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/gemma-3-1b-it-GGUF",
+        "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/gemma-3-4b-it-GGUF",
+        "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
"model_id": "bartowski/gemma-3-12b-it-GGUF", + "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/gemma-3-27b-it-GGUF", + "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-1b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-4b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-12b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-27b-it-fp16" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ 
raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 4428f849cd..549883485a 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3738,6 +3738,166 @@ "<start_of_turn>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "gemma-3-it", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 1, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-1b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-4b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 12, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-12b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 27, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-27b-it", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 1, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 12, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": 
"bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf", + "model_hub": "modelscope" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version":1, "context_length":2048, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 776663a03e..3e01b71395 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -94,6 +94,7 @@ class SGLANGGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "gemma-it", "gemma-2-it", + "gemma-3-it", "deepseek-v2.5", "deepseek-v2-chat", "deepseek-v2-chat-0628", From be2fe0b2950621bd2bd5ff196634e9f95c85315a Mon Sep 17 00:00:00 2001 From: Jun-Howie <JunHowie@aliyun.com> Date: Wed, 19 Mar 2025 17:04:27 +0800 Subject: [PATCH 2/5] fix: gguf model_id (cherry picked from commit 67be8e231d18fc25bcbdba29ea169190e9cf5b9d) --- xinference/model/llm/llm_family.json | 16 ++++++++-------- xinference/model/llm/llm_family_modelscope.json | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 5b13efc2b4..cddb8cc66d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -5866,8 +5866,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-1b-it-GGUF", - "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5897,8 +5897,8 @@ "Q8_0", "bf16" ], - "model_id": 
"bartowski/gemma-3-4b-it-GGUF", - "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5928,8 +5928,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-12b-it-GGUF", - "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5959,8 +5959,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-27b-it-GGUF", - "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf" }, { "model_format": "mlx", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 549883485a..86affa51ae 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3814,7 +3814,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-1b-it-GGUF", - "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3837,7 +3837,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-4b-it-GGUF", - "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3860,7 +3860,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-12b-it-GGUF", - "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3883,7 +3883,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-27b-it-GGUF", - "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf", "model_hub": "modelscope" } ], From 6a33ed64ec31f6844e03c01e14f47ee40c6876a3 Mon Sep 17 00:00:00 2001 From: Jun-Howie <JunHowie@aliyun.com> Date: Wed, 19 Mar 2025 17:19:53 +0800 Subject: [PATCH 3/5] fix mlx --- xinference/model/llm/llm_family.json | 125 +++--------------- .../model/llm/llm_family_modelscope.json | 48 +++++++ xinference/model/llm/mlx/core.py | 6 +- 3 files changed, 74 insertions(+), 105 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cddb8cc66d..37bb0a08d8 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -5966,134 +5966,51 @@ "model_format": "mlx", "model_size_in_billions": 1, "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "8bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-8bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-1b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 4, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-4b-it-4bit" - }, - { - "model_format": "mlx", - 
"model_size_in_billions": 4, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-4b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 4, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-4b-it-8bit" + "model_id": "mlx-community/gemma-3-1b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 4, "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-4b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-12b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-12b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-12b-it-8bit" + "model_id": "mlx-community/gemma-3-4b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 12, "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-12b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-27b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-27b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-27b-it-8bit" + "model_id": "mlx-community/gemma-3-12b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 27, "quantizations": [ - "None" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-27b-it-fp16" + "model_id": "mlx-community/gemma-3-27b-it-{quantization}" } ], "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", "stop_token_ids": [ 1, + 105, 106 ], "stop": [ diff --git a/xinference/model/llm/llm_family_modelscope.json 
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 86affa51ae..405a2fe22b 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -3885,6 +3885,54 @@
         "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
         "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
+        "model_hub": "modelscope"
       }
     ],
     "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index ef7a585d12..b3efbda8cb 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -148,11 +148,15 @@ def _load_model(self, **kwargs):
         self._max_kv_size = kwargs.get("max_kv_size", None)
         self._prompt_cache = PromptCache()
 
-        return load(
+        model, tokenizer = load(
             self.model_path,
             tokenizer_config=tokenizer_config,
             model_config=self._model_config,
         )
+        if stop_token_ids := self.model_family.stop_token_ids:
+            for stop_token_id in stop_token_ids:
+                tokenizer.add_eos_token(stop_token_id)
+        return model, tokenizer
 
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
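The mlx change above threads the family's ``stop_token_ids`` into mlx-lm's tokenizer wrapper, whose ``add_eos_token`` grows the set of ids treated as end-of-sequence. The effect reduces to a membership test in the generation loop; a self-contained sketch of the pattern (illustrative, not mlx-lm's actual internals):

    class EosRegistry:
        """Accumulates extra end-of-sequence ids, mirroring add_eos_token."""

        def __init__(self, base_eos_id: int) -> None:
            self.eos_ids = {base_eos_id}

        def add_eos_token(self, token_id: int) -> None:
            self.eos_ids.add(token_id)

        def should_stop(self, token_id: int) -> bool:
            return token_id in self.eos_ids

    registry = EosRegistry(base_eos_id=1)    # <eos>
    for stop_token_id in (105, 106):         # stop_token_ids from the JSON above
        registry.add_eos_token(stop_token_id)
    assert registry.should_stop(106)         # <end_of_turn> now ends decoding

Without this registration, an MLX run would stream past ``<end_of_turn>``, since the raw tokenizer only treats ``<eos>`` as terminal.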
From cf43f0f3373f66ef2ed16f6abba493b55b37f772 Mon Sep 17 00:00:00 2001
From: Xuye Qin <qinxuye@gmail.com>
Date: Thu, 20 Mar 2025 11:32:19 +0800
Subject: [PATCH 4/5] Update xinference/model/llm/llm_family.json

Co-authored-by: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com>
---
 xinference/model/llm/llm_family.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 37bb0a08d8..95daecd667 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -5840,7 +5840,7 @@
       },
       {
         "model_format": "ggufv2",
-        "model_size_in_billions": 2,
+        "model_size_in_billions": 1,
         "quantizations": [
           "IQ2_M",
           "IQ3_M",

From f7559a2c4663ce92b6312f733365a3e90600b52b Mon Sep 17 00:00:00 2001
From: qinxuye <qinxuye@gmail.com>
Date: Thu, 20 Mar 2025 17:29:43 +0800
Subject: [PATCH 5/5] support vllm

---
 xinference/model/llm/vllm/core.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 3464b1a421..ba87cf9301 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -216,6 +216,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-it")
+
 
 class VLLMModel(LLM):
     def __init__(
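With the series applied end to end, ``gemma-3-it`` is selectable on the SGLang, MLX, and (for vllm >= 0.8.0) vLLM backends. A usage sketch against a running xinference endpoint (illustrative; exact launch parameters can vary across xinference releases):

    from xinference.client import Client

    client = Client("http://localhost:9997")
    uid = client.launch_model(
        model_name="gemma-3-it",
        model_engine="vllm",          # gated on vllm >= 0.8.0 by the hunk above
        model_format="pytorch",
        model_size_in_billions=4,
        quantization="none",
    )
    model = client.get_model(uid)
    print(model.chat(messages=[{"role": "user", "content": "Hi!"}]))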