166 changes: 155 additions & 11 deletions doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-08-02 23:15+0800\n"
"POT-Creation-Date: 2025-11-21 18:03+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@@ -17,7 +17,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
"Generated-By: Babel 2.17.0\n"

#: ../../source/user_guide/launch.rst:5
msgid "Model Launching Instructions"
@@ -46,11 +46,115 @@ msgstr ""
"两张 GPU 上。Xinference 会自动进行负载均衡,确保请求均匀分配到多张卡上。"
"用户看到的仍是一个模型,这大大提升了整体资源利用率。"

#: ../../source/user_guide/launch.rst:18
#: ../../source/user_guide/launch.rst:17
msgid "Traditional Multi-Instance Deployment:"
msgstr "旧版本多实例部署:"

#: ../../source/user_guide/launch.rst:19
msgid ""
"When you have multiple GPU cards, each capable of hosting one model "
"instance, you can set the number of instances equal to the number of "
"GPUs. For example:"
msgstr ""
"当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为"
"等于GPU数量。例如:"

#: ../../source/user_guide/launch.rst:21
msgid "2 GPUs, 2 instances: Each GPU runs one model instance"
msgstr "2张GPU,2个实例:每张GPU运行一个模型实例"

#: ../../source/user_guide/launch.rst:22
msgid "4 GPUs, 4 instances: Each GPU runs one model instance"
msgstr "4张GPU,4个实例:每张GPU运行一个模型实例"

#: ../../source/user_guide/launch.rst:26
msgid "Introduce a new environment variable:"
msgstr "引入一个新的环境变量:"

#: ../../source/user_guide/launch.rst:32
msgid ""
"Control whether to enable the single GPU multi-copy feature Default "
"value: 1"
msgstr "控制是否启用单GPU多副本功能,默认值:1"

#: ../../source/user_guide/launch.rst:36
msgid "Key Features: Four new launch strategies"
msgstr "关键特性:四个新的启动策略"

#: ../../source/user_guide/launch.rst:39
msgid "Memory-Aware Strategy"
msgstr "内存感知策略"

#: ../../source/user_guide/launch.rst:41
msgid "**Strategy Name:** `memory_aware` (default)"
msgstr "**策略名称:** `memory_aware` (默认)"

#: ../../source/user_guide/launch.rst:43
msgid ""
"**Description:** Advanced memory-aware allocation with intelligent GPU "
"reuse. Estimates model memory requirements using formula: `model_size(GB)"
" × 1024 × 1.5`, adjusts for quantization, and checks actual GPU "
"availability before allocation."
msgstr ""
"**描述:** 先进的内存感知分配,支持智能GPU复用。"
"使用公式 `模型大小(GB) × 1024 × 1.5` 估算模型内存需求,"
"根据量化级别调整,并在分配前检查实际GPU可用性。"

#: ../../source/user_guide/launch.rst:49
msgid "Packing-First Strategy"
msgstr "打包优先策略"

#: ../../source/user_guide/launch.rst:51
msgid "**Strategy Name:** `packing_first`"
msgstr "**策略名称:** `packing_first`"

#: ../../source/user_guide/launch.rst:53
msgid ""
"**Description:** Prioritizes filling GPUs with the most available memory "
"before moving to the next. Optimizes for GPU utilization by consolidating"
" instances."
msgstr ""
"**描述:** 优先填满可用内存最多的GPU,然后再使用下一个。通过整合实例来优化GPU利用率。"

#: ../../source/user_guide/launch.rst:58
msgid "Spread-First Strategy"
msgstr "分散优先策略"

#: ../../source/user_guide/launch.rst:60
msgid "**Strategy Name:** `spread_first`"
msgstr "**策略名称:** `spread_first`"

#: ../../source/user_guide/launch.rst:62
msgid ""
"**Description:** Distributes replicas across available GPUs using round-"
"robin. Promotes load balancing and even resource utilization. Still sorts"
" by available memory to prioritize better GPUs."
msgstr ""
"**描述:** 在可用GPU间使用轮询分布副本。促进负载均衡和均匀资源利用。"
"仍然按可用内存排序以优先选择更好的GPU。"

#: ../../source/user_guide/launch.rst:66
msgid "Quota-Aware Strategy"
msgstr "配额感知策略"

#: ../../source/user_guide/launch.rst:68
msgid "**Strategy Name:** `quota_aware`"
msgstr "**策略名称:** `quota_aware`"

#: ../../source/user_guide/launch.rst:70
msgid ""
"**Description:** Restricts allocation to a specified GPU quota, then "
"applies spread-first distribution within that quota. Ideal for multi-"
"tenant resource isolation and cost allocation."
msgstr ""
"**描述:** 将分配限制在指定的GPU配额内,然后在配额内应用分散优先分发。"
"适合多租户资源隔离和成本分配。"

#: ../../source/user_guide/launch.rst:74
msgid "Set Environment Variables"
msgstr "设置环境变量"

#: ../../source/user_guide/launch.rst:22
#: ../../source/user_guide/launch.rst:78
msgid ""
"Sometimes, we want to specify environment variables for a particular "
"model at runtime. Since v1.8.1, Xinference provides the capability to "
@@ -60,21 +164,21 @@ msgstr ""
"有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference "
"提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。"

#: ../../source/user_guide/launch.rst:25
#: ../../source/user_guide/launch.rst:81
msgid "For Web UI."
msgstr "针对 Web UI。"

#: ../../source/user_guide/launch.rst:31
#: ../../source/user_guide/launch.rst:87
msgid ""
"When using the command line, use ``--env`` to specify an environment "
"variable."
msgstr "命令行使用时,使用 ``--env`` 指定环境变量。"

#: ../../source/user_guide/launch.rst:33
#: ../../source/user_guide/launch.rst:89
msgid "Example usage:"
msgstr "示例用法:"

#: ../../source/user_guide/launch.rst:39
#: ../../source/user_guide/launch.rst:95
msgid ""
"Take vLLM as an example: it has versions V1 and V0, and by default, it "
"automatically determines which version to use. If you want to force the "
@@ -85,13 +189,53 @@ msgstr ""
"在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量"
"。"

#: ../../source/user_guide/launch.rst:43
#: ../../source/user_guide/launch.rst:99
msgid "Configuring Model Virtual Environment"
msgstr "配置模型虚拟空间"

#: ../../source/user_guide/launch.rst:47
#: ../../source/user_guide/launch.rst:103
msgid ""
"For this part, please refer to :ref:`toggling virtual environments and "
"customizing dependencies <model_launching_virtualenv>`."
msgstr "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 <model_launching_virtualenv>`。"
msgstr ""
"对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 <model_launching_"
"virtualenv>`。"

#~ msgid "New Feature: Smart Replica Deployment"
#~ msgstr "新功能:智能副本部署"

#~ msgid "Single GPU Multi-Replica"
#~ msgstr "单GPU多副本"

#~ msgid "New Support: Run multiple model replicas even with just one GPU."
#~ msgstr "新增支持:即使仅有一块GPU,也能运行多个模型副本。"

#~ msgid "Scenario: You have 1 GPU with sufficient VRAM"
#~ msgstr "场景:您拥有1个GPU且显存充足"

#~ msgid "Configuration: Replica Count = 3, GPU Count = 1"
#~ msgstr "配置:副本数量=3,GPU数量=1"

#~ msgid ""
#~ "Result: 3 model instances running on "
#~ "the same GPU, sharing GPU resources"
#~ msgstr "结果:3个模型实例,在同一GPU上运行,共享GPU资源"

#~ msgid "Hybrid GPU Allocation"
#~ msgstr "混合GPU分配"

#~ msgid ""
#~ "Smart Allocation: Number of replicas may"
#~ " differ from GPU count; system "
#~ "intelligently distributes"
#~ msgstr "智能分配: 副本数可以不等于GPU数量,系统会智能分配"

#~ msgid "Scenario: You have 2 GPUs and need 3 replicas"
#~ msgstr "场景: 你有2张GPU,需要3个副本"

#~ msgid "Configuration: Replicas=3, GPUs=2"
#~ msgstr "配置: 副本数=3,GPU数量=2"

#~ msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance"
#~ msgstr "结果: GPU0运行2个实例,GPU1运行1个实例"

56 changes: 56 additions & 0 deletions doc/source/user_guide/launch.rst
@@ -14,6 +14,62 @@ you can set the replica count to 2. This way, two identical instances of the mod
Xinference automatically load-balances requests to ensure even distribution across multiple GPUs.
Meanwhile, users see it as a single model, which greatly improves overall resource utilization.

Traditional Multi-Instance Deployment:

When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example:

- 2 GPUs, 2 instances: Each GPU runs one model instance
- 4 GPUs, 4 instances: Each GPU runs one model instance
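
For instance, a traditional 2-GPU, 2-instance launch might look like the
minimal sketch below. The endpoint, model name, and parameters such as
``replica`` and ``n_gpu`` are illustrative assumptions, not a verbatim API
reference:

.. code-block:: python

   from xinference.client import Client

   # Assumes a supervisor running at the default local endpoint.
   client = Client("http://localhost:9997")

   # 2 GPUs, 2 instances: each replica is expected to land on its own GPU.
   model_uid = client.launch_model(
       model_name="qwen2.5-instruct",  # hypothetical model name
       model_engine="vllm",
       replica=2,
       n_gpu=2,
   )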

.. versionadded:: v1.14.0

A new environment variable is introduced:

.. code-block:: bash

XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA

Controls whether to enable the single-GPU multi-replica feature.
Default value: ``1`` (enabled).

Key Features: Four new launch strategies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Memory-Aware Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `memory_aware` (default)

**Description:**
Advanced memory-aware allocation with intelligent GPU reuse.
Estimates model memory requirements using formula: `model_size(GB) × 1024 × 1.5`,
adjusts for quantization, and checks actual GPU availability before allocation.
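
The estimate above can be reproduced in a few lines of Python. This is a
sketch of the formula only; the quantization factors are illustrative
assumptions, not the scheduler's exact internal values:

.. code-block:: python

   def estimate_required_mib(model_size_gb: float, quantization: str = "none") -> float:
       """Rough VRAM estimate in MiB: model_size(GB) × 1024 × 1.5."""
       estimate = model_size_gb * 1024 * 1.5
       # Hypothetical adjustment: quantized weights shrink the footprint.
       factors = {"8-bit": 0.5, "4-bit": 0.25}
       estimate *= factors.get(quantization, 1.0)
       return estimate

   print(estimate_required_mib(7))  # 10752.0 MiB for a 7 GB model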

Packing-First Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `packing_first`

**Description:**
Prioritizes filling GPUs with the most available memory before moving to the next.
Optimizes for GPU utilization by consolidating instances.

Spread-First Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `spread_first`

**Description:**
Distributes replicas across available GPUs using round-robin. Promotes load balancing and even resource utilization. Still sorts by available memory to prioritize better GPUs.

Quota-Aware Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `quota_aware`

**Description:**
Restricts allocation to a specified GPU quota, then applies spread-first distribution within that quota. Ideal for multi-tenant resource isolation and cost allocation.
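
To make the difference between the last three strategies concrete, the sketch
below places replicas on a list of ``(gpu_id, free_mib)`` pairs. It
illustrates the placement logic only and is not the scheduler's actual code:

.. code-block:: python

   def place(replicas, gpus, strategy, quota=None):
       """Return one GPU id per replica; gpus is [(gpu_id, free_mib), ...]."""
       # Every strategy considers GPUs sorted by free memory, best first.
       ordered = [g for g, _ in sorted(gpus, key=lambda x: -x[1])]
       if strategy == "quota_aware":
           # Restrict to the allowed quota, then spread within it.
           ordered = [g for g in ordered if quota is None or g in quota]
           strategy = "spread_first"
       if strategy == "packing_first":
           # Consolidate on the best GPU (a real implementation would stop
           # packing once the memory estimate no longer fits).
           return [ordered[0]] * replicas
       # spread_first: round-robin across the ordered GPUs.
       return [ordered[i % len(ordered)] for i in range(replicas)]

   gpus = [(0, 40_000), (1, 24_000), (2, 16_000)]
   print(place(3, gpus, "packing_first"))              # [0, 0, 0]
   print(place(3, gpus, "spread_first"))               # [0, 1, 2]
   print(place(3, gpus, "quota_aware", quota={1, 2}))  # [1, 2, 1]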

Set Environment Variables
=========================

17 changes: 14 additions & 3 deletions xinference/api/restful_api.py
@@ -1259,11 +1259,22 @@ async def launch_model(

if isinstance(gpu_idx, int):
gpu_idx = [gpu_idx]
if gpu_idx:
if len(gpu_idx) % replica:

# Check if single-GPU multi-replica is enabled
from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA

if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA:
# Allow single- or multi-GPU reuse for replicas
if gpu_idx and replica > 1:
logger.info(
f"Multi-replica deployment enabled: {replica} replicas across GPUs {gpu_idx}"
)
else:
# Traditional behavior - strict multiple requirement
if gpu_idx and len(gpu_idx) % replica:
raise HTTPException(
status_code=400,
detail="Invalid input. Allocated gpu must be a multiple of replica.",
detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.",
)

if peft_model_config is not None:
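
The effect of the flag on the endpoint above can be exercised over HTTP. A
minimal sketch, assuming a supervisor on localhost and a payload matching the
fields handled above; the model name is a placeholder:

import requests

# With XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 (the default), 3 replicas
# on 2 GPUs is accepted; with it set to 0, this request fails with HTTP 400.
resp = requests.post(
    "http://localhost:9997/v1/models",
    json={
        "model_name": "qwen2.5-instruct",  # hypothetical model name
        "model_engine": "vllm",
        "replica": 3,
        "gpu_idx": [0, 1],
    },
)
print(resp.status_code)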
15 changes: 15 additions & 0 deletions xinference/constants.py
@@ -34,6 +34,11 @@
XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS"
XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS"
XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS"
XINFERENCE_ENV_LAUNCH_STRATEGY = "XINFERENCE_LAUNCH_STRATEGY"
XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS = "XINFERENCE_LAUNCH_ALLOWED_GPUS"
XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = (
"XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"
)
XINFERENCE_ENV_BATCH_SIZE = "XINFERENCE_BATCH_SIZE"
XINFERENCE_ENV_BATCH_INTERVAL = "XINFERENCE_BATCH_INTERVAL"

@@ -114,5 +119,15 @@ def get_xinference_home() -> str:
else None
)
XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS)
XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool(
int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1"))
) # Enable by default
XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "memory_aware")
_allowed_gpu_str = os.getenv(XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS, "")
XINFERENCE_LAUNCH_ALLOWED_GPUS = (
{int(x) for x in _allowed_gpu_str.split(",") if x.strip().isdigit()}
if _allowed_gpu_str
else None
)
XINFERENCE_BATCH_SIZE = int(os.getenv(XINFERENCE_ENV_BATCH_SIZE, "32"))
XINFERENCE_BATCH_INTERVAL = float(os.getenv(XINFERENCE_ENV_BATCH_INTERVAL, "0.003"))
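
A quick way to sanity-check the new constants is to set the environment
variables before the module is imported; this sketch assumes a checkout where
xinference.constants is importable:

import os

# Hypothetical multi-tenant setup: pack replicas, restrict to GPUs 0 and 2.
os.environ["XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"] = "1"
os.environ["XINFERENCE_LAUNCH_STRATEGY"] = "packing_first"
os.environ["XINFERENCE_LAUNCH_ALLOWED_GPUS"] = "0,2"

from xinference import constants

print(constants.XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA)  # True
print(constants.XINFERENCE_LAUNCH_STRATEGY)                  # packing_first
print(constants.XINFERENCE_LAUNCH_ALLOWED_GPUS)              # {0, 2}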