diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index 679c8033ec..36ff45cfdc 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -61,6 +61,7 @@ Currently, supported models include: - ``QwQ-32B-Preview``, ``QwQ-32B`` - ``marco-o1`` - ``gemma-it``, ``gemma-2-it`` +- ``gemma-3-it``, ``gemma-3-27b-it``, ``gemma-3-12b-it``, ``gemma-3-4b-it``, ``gemma-3-1b-it`` - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` - ``minicpm3-4b`` diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 95b8d6a1e5..95daecd667 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -5786,6 +5786,239 @@ "<start_of_turn>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "gemma-3-it", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 1, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "google/gemma-3-1b-it" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "google/gemma-3-4b-it" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 12, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "google/gemma-3-12b-it" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 27, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "google/gemma-3-27b-it" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 1, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 12, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "IQ2_M", + "IQ3_M", + "IQ3_XS", + "IQ3_XXS", + "IQ4_NL", + "IQ4_XS", + "Q2_K", + "Q2_K_L", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + 
"Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-1b-it-{quantization}" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-4b-it-{quantization}" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-12b-it-{quantization}" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-27b-it-{quantization}" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 105, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 4428f849cd..405a2fe22b 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3738,6 +3738,214 @@ "<start_of_turn>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "gemma-3-it", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 1, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-1b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-4b-it", + 
"model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 12, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-12b-it", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 27, + "quantizations": [ + "none", + "4-bit", + "8-bit" + ], + "model_id": "LLM-Research/gemma-3-27b-it", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 1, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-1b-it-GGUF", + "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-4b-it-GGUF", + "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 12, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-12b-it-GGUF", + "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 27, + "quantizations": [ + "Q2_K", + "Q3_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_K_L", + "Q4_K_M", + "Q4_K_S", + "Q5_K_L", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q6_K_L", + "Q8_0", + "bf16" + ], + "model_id": "bartowski/google_gemma-3-27b-it-GGUF", + "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 1, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-1b-it-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 4, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-4b-it-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 12, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-12b-it-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 27, + "quantizations": [ + "4bit", + "6bit", + "8bit", + "fp16" + ], + "model_id": "mlx-community/gemma-3-27b-it-{quantization}", + "model_hub": "modelscope" + } + ], + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must 
alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", + "stop_token_ids": [ + 1, + 106 + ], + "stop": [ + "<eos>", + "<end_of_turn>", + "<start_of_turn>" + ] + }, { "version":1, "context_length":2048, diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index ef7a585d12..b3efbda8cb 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -148,11 +148,15 @@ def _load_model(self, **kwargs): self._max_kv_size = kwargs.get("max_kv_size", None) self._prompt_cache = PromptCache() - return load( + model, tokenizer = load( self.model_path, tokenizer_config=tokenizer_config, model_config=self._model_config, ) + if stop_token_ids := self.model_family.stop_token_ids: + for stop_token_id in stop_token_ids: + tokenizer.add_eos_token(stop_token_id) + return model, tokenizer def load(self): reasoning_content = self._model_config.pop("reasoning_content") diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 776663a03e..3e01b71395 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -94,6 +94,7 @@ class SGLANGGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "gemma-it", "gemma-2-it", + "gemma-3-it", "deepseek-v2.5", "deepseek-v2-chat", "deepseek-v2-chat-0628", diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 3464b1a421..ba87cf9301 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -216,6 +216,9 @@ class VLLMGenerateConfig(TypedDict, total=False): if VLLM_INSTALLED and vllm.__version__ >= "0.7.3": VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m") +if VLLM_INSTALLED and vllm.__version__ >= "0.8.0": + VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-it") + class VLLMModel(LLM): def __init__(
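
Not part of the patch above, but as a quick sanity check of the new registration: a minimal sketch of launching the newly added ``gemma-3-it`` family through the Xinference Python client. The host/port, the Transformers engine choice, and the exact ``chat()`` signature are assumptions (they vary slightly across client versions); the model name, size, format, and quantization must match one of the specs registered in ``llm_family.json``.

```python
# Hypothetical usage sketch, assuming an Xinference server is already
# running at localhost:9997 and the model files can be downloaded.
from xinference.client import Client

client = Client("http://localhost:9997")

# Launch the 4B pytorch spec added by this patch. Recent client versions
# require an explicit model_engine; "transformers" is used here as an example.
model_uid = client.launch_model(
    model_name="gemma-3-it",
    model_engine="transformers",
    model_size_in_billions=4,
    model_format="pytorch",
    quantization="none",
)

# Send a single chat turn through the OpenAI-style messages interface.
model = client.get_model(model_uid)
print(model.chat(
    messages=[{"role": "user", "content": "Give me a short introduction to Gemma 3."}]
))
```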
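The ``chat_template`` added in both JSON files uses Gemma's ``<start_of_turn>``/``<end_of_turn>`` turn markers and folds the system prompt into the first user turn. Below is a hypothetical sketch for rendering that template with plain Jinja2, outside Xinference, to inspect the prompt it produces; ``raise_exception`` is not a Jinja2 builtin, so a small stub is registered, and the file path assumes the script is run from the repository root.

```python
# Sketch only: render the gemma-3-it chat template added in this patch
# and print the resulting prompt string.
import json
from jinja2 import Environment


def raise_exception(message):
    # The template calls raise_exception() on malformed conversations.
    raise ValueError(message)


with open("xinference/model/llm/llm_family.json") as f:
    families = json.load(f)

# Pick the template string from the gemma-3-it family entry.
template_str = next(
    fam["chat_template"] for fam in families if fam["model_name"] == "gemma-3-it"
)

env = Environment()
env.globals["raise_exception"] = raise_exception
template = env.from_string(template_str)

# The system message is merged into the first user turn by the template.
print(template.render(
    bos_token="<bos>",
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    add_generation_prompt=True,
))
```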