From 243669ed7c8bb43d261f990fe39f6a38c0d08d0f Mon Sep 17 00:00:00 2001
From: kaiyuan Zhang <zky21@mails.tsinghua.edu.cn>
Date: Mon, 17 Mar 2025 02:59:54 +0000
Subject: [PATCH 1/5] FEAT: Support gemma-3

---
 doc/source/getting_started/installation.rst |   1 +
 xinference/model/llm/llm_family.json        | 316 ++++++++++++++++++
 .../model/llm/llm_family_modelscope.json    | 160 +++++++++
 xinference/model/llm/sglang/core.py         |   1 +
 4 files changed, 478 insertions(+)

diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index 679c8033ec..36ff45cfdc 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -61,6 +61,7 @@ Currently, supported models include:
 - ``QwQ-32B-Preview``, ``QwQ-32B``
 - ``marco-o1``
 - ``gemma-it``, ``gemma-2-it``
+- ``gemma-3-it``, ``gemma-3-27b-it``, ``gemma-3-12b-it``, ``gemma-3-4b-it``, ``gemma-3-1b-it``
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
 - ``minicpm3-4b``
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 95b8d6a1e5..5b13efc2b4 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -5786,6 +5786,322 @@
       "<start_of_turn>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "gemma-3-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-1b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-4b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-12b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-3-27b-it"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/gemma-3-1b-it-GGUF",
+        "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/gemma-3-4b-it-GGUF",
+        "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "IQ3_XS",
+          "IQ3_XXS",
+          "IQ4_NL",
+          "IQ4_XS",
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
"model_id": "bartowski/gemma-3-12b-it-GGUF", + "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/gemma-3-27b-it-GGUF", + "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-1b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-1b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-4b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-4b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-12b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-12b-it-fp16" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-4bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "6bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-6bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "8bit" + ], + "model_id": "mlx-community/gemma-3-27b-it-8bit" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "None" + ], + "model_id": "mlx-community/gemma-3-27b-it-fp16" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ 
raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 4428f849cd..549883485a 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3738,6 +3738,166 @@ "<start_of_turn>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "gemma-3-it", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 1, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-1b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-4b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 12, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-12b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 27, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-27b-it", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 1, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 12, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": 
"bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf", + "model_hub": "modelscope" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version":1, "context_length":2048, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 776663a03e..3e01b71395 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -94,6 +94,7 @@ class SGLANGGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "gemma-it", "gemma-2-it", + "gemma-3-it", "deepseek-v2.5", "deepseek-v2-chat", "deepseek-v2-chat-0628", From be2fe0b2950621bd2bd5ff196634e9f95c85315a Mon Sep 17 00:00:00 2001 From: Jun-Howie <JunHowie@aliyun.com> Date: Wed, 19 Mar 2025 17:04:27 +0800 Subject: [PATCH 2/5] fix: gguf model_id (cherry picked from commit 67be8e231d18fc25bcbdba29ea169190e9cf5b9d) --- xinference/model/llm/llm_family.json | 16 ++++++++-------- xinference/model/llm/llm_family_modelscope.json | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 5b13efc2b4..cddb8cc66d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -5866,8 +5866,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-1b-it-GGUF", - "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5897,8 +5897,8 @@ "Q8_0", "bf16" ], - "model_id": 
"bartowski/gemma-3-4b-it-GGUF", - "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5928,8 +5928,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-12b-it-GGUF", - "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -5959,8 +5959,8 @@ "Q8_0", "bf16" ], - "model_id": "bartowski/gemma-3-27b-it-GGUF", - "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf" + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf" }, { "model_format": "mlx", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 549883485a..86affa51ae 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3814,7 +3814,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-1b-it-GGUF", - "model_file_name_template": "gemma-3-1b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3837,7 +3837,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-4b-it-GGUF", - "model_file_name_template": "gemma-3-4b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3860,7 +3860,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-12b-it-GGUF", - "model_file_name_template": "gemma-3-12b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -3883,7 +3883,7 @@ "bf16" ], "model_id": "bartowski/google_gemma-3-27b-it-GGUF", - "model_file_name_template": "gemma-3-27b-it-{quantization}.gguf", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf", "model_hub": "modelscope" } ], From 6a33ed64ec31f6844e03c01e14f47ee40c6876a3 Mon Sep 17 00:00:00 2001 From: Jun-Howie <JunHowie@aliyun.com> Date: Wed, 19 Mar 2025 17:19:53 +0800 Subject: [PATCH 3/5] fix mlx --- xinference/model/llm/llm_family.json | 125 +++--------------- .../model/llm/llm_family_modelscope.json | 48 +++++++ xinference/model/llm/mlx/core.py | 6 +- 3 files changed, 74 insertions(+), 105 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cddb8cc66d..37bb0a08d8 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -5966,134 +5966,51 @@ "model_format": "mlx", "model_size_in_billions": 1, "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "8bit" - ], - "model_id": "mlx-community/gemma-3-1b-it-8bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 1, - "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-1b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 4, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-4b-it-4bit" - }, - { - "model_format": "mlx", - 
"model_size_in_billions": 4, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-4b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 4, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-4b-it-8bit" + "model_id": "mlx-community/gemma-3-1b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 4, "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-4b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-12b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-12b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 12, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-12b-it-8bit" + "model_id": "mlx-community/gemma-3-4b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 12, "quantizations": [ - "None" - ], - "model_id": "mlx-community/gemma-3-12b-it-fp16" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "4bit" - ], - "model_id": "mlx-community/gemma-3-27b-it-4bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "6bit" - ], - "model_id": "mlx-community/gemma-3-27b-it-6bit" - }, - { - "model_format": "mlx", - "model_size_in_billions": 27, - "quantizations": [ - "8bit" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-27b-it-8bit" + "model_id": "mlx-community/gemma-3-12b-it-{quantization}" }, { "model_format": "mlx", "model_size_in_billions": 27, "quantizations": [ - "None" + "4bit", + "6bit", + "8bit", + "fp16" ], - "model_id": "mlx-community/gemma-3-27b-it-fp16" + "model_id": "mlx-community/gemma-3-27b-it-{quantization}" } ], "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", "stop_token_ids": [ 1, + 105, 106 ], "stop": [ diff --git a/xinference/model/llm/llm_family_modelscope.json 
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 86affa51ae..405a2fe22b 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -3885,6 +3885,54 @@
         "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
         "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
+        "model_hub": "modelscope"
       }
     ],
     "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index ef7a585d12..b3efbda8cb 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -148,11 +148,15 @@ def _load_model(self, **kwargs):
         self._max_kv_size = kwargs.get("max_kv_size", None)
         self._prompt_cache = PromptCache()
 
-        return load(
+        model, tokenizer = load(
             self.model_path,
             tokenizer_config=tokenizer_config,
             model_config=self._model_config,
         )
+        if stop_token_ids := self.model_family.stop_token_ids:
+            for stop_token_id in stop_token_ids:
+                tokenizer.add_eos_token(stop_token_id)
+        return model, tokenizer
 
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
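The mlx change above threads the family's ``stop_token_ids`` into mlx-lm's tokenizer wrapper, whose ``add_eos_token`` grows the set of ids treated as end-of-sequence. The effect reduces to a membership test in the generation loop; a self-contained sketch of the pattern (illustrative, not mlx-lm's actual internals):

    class EosRegistry:
        """Accumulates extra end-of-sequence ids, mirroring add_eos_token."""

        def __init__(self, base_eos_id: int) -> None:
            self.eos_ids = {base_eos_id}

        def add_eos_token(self, token_id: int) -> None:
            self.eos_ids.add(token_id)

        def should_stop(self, token_id: int) -> bool:
            return token_id in self.eos_ids

    registry = EosRegistry(base_eos_id=1)    # <eos>
    for stop_token_id in (105, 106):         # stop_token_ids from the JSON above
        registry.add_eos_token(stop_token_id)
    assert registry.should_stop(106)         # <end_of_turn> now ends decoding

Without this registration, an MLX run would stream past ``<end_of_turn>``, since the raw tokenizer only treats ``<eos>`` as terminal.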
From cf43f0f3373f66ef2ed16f6abba493b55b37f772 Mon Sep 17 00:00:00 2001
From: Xuye Qin <qinxuye@gmail.com>
Date: Thu, 20 Mar 2025 11:32:19 +0800
Subject: [PATCH 4/5] Update xinference/model/llm/llm_family.json

Co-authored-by: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com>
---
 xinference/model/llm/llm_family.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 37bb0a08d8..95daecd667 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -5840,7 +5840,7 @@
       },
       {
         "model_format": "ggufv2",
-        "model_size_in_billions": 2,
+        "model_size_in_billions": 1,
         "quantizations": [
           "IQ2_M",
           "IQ3_M",

From f7559a2c4663ce92b6312f733365a3e90600b52b Mon Sep 17 00:00:00 2001
From: qinxuye <qinxuye@gmail.com>
Date: Thu, 20 Mar 2025 17:29:43 +0800
Subject: [PATCH 5/5] support vllm

---
 xinference/model/llm/vllm/core.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 3464b1a421..ba87cf9301 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -216,6 +216,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-it")
+
 
 class VLLMModel(LLM):
     def __init__(
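With the series applied end to end, ``gemma-3-it`` is selectable on the SGLang, MLX, and (for vllm >= 0.8.0) vLLM backends. A usage sketch against a running xinference endpoint (illustrative; exact launch parameters can vary across xinference releases):

    from xinference.client import Client

    client = Client("http://localhost:9997")
    uid = client.launch_model(
        model_name="gemma-3-it",
        model_engine="vllm",          # gated on vllm >= 0.8.0 by the hunk above
        model_format="pytorch",
        model_size_in_billions=4,
        quantization="none",
    )
    model = client.get_model(uid)
    print(model.chat(messages=[{"role": "user", "content": "Hi!"}]))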