166 changes: 155 additions & 11 deletions doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-08-02 23:15+0800\n"
"POT-Creation-Date: 2025-11-21 18:03+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@@ -17,7 +17,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
"Generated-By: Babel 2.17.0\n"

#: ../../source/user_guide/launch.rst:5
msgid "Model Launching Instructions"
@@ -46,11 +46,115 @@ msgstr ""
"两张 GPU 上。Xinference 会自动进行负载均衡,确保请求均匀分配到多张卡上。"
"用户看到的仍是一个模型,这大大提升了整体资源利用率。"

#: ../../source/user_guide/launch.rst:18
#: ../../source/user_guide/launch.rst:17
msgid "Traditional Multi-Instance Deployment:"
msgstr "旧版本多实例部署:"

#: ../../source/user_guide/launch.rst:19
msgid ""
"When you have multiple GPU cards, each capable of hosting one model "
"instance, you can set the number of instances equal to the number of "
"GPUs. For example:"
msgstr ""
"当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为"
"等于GPU数量。例如:"

#: ../../source/user_guide/launch.rst:21
msgid "2 GPUs, 2 instances: Each GPU runs one model instance"
msgstr "2张GPU,2个实例:每张GPU运行一个模型实例"

#: ../../source/user_guide/launch.rst:22
msgid "4 GPUs, 4 instances: Each GPU runs one model instance"
msgstr "4张GPU,4个实例:每张GPU运行一个模型实例"

#: ../../source/user_guide/launch.rst:26
msgid "Introduce a new environment variable:"
msgstr "引入一个新的环境变量:"

#: ../../source/user_guide/launch.rst:32
msgid ""
"Control whether to enable the single GPU multi-copy feature Default "
"value: 1"
msgstr "控制是否启用单GPU多副本功能,默认值:1"

#: ../../source/user_guide/launch.rst:36
msgid "Key Features: Four new launch strategies"
msgstr "关键特性:四个新的启动策略"

#: ../../source/user_guide/launch.rst:39
msgid "Memory-Aware Strategy"
msgstr "内存感知策略"

#: ../../source/user_guide/launch.rst:41
msgid "**Strategy Name:** `memory_aware` (default)"
msgstr "**策略名称:** `memory_aware` (默认)"

#: ../../source/user_guide/launch.rst:43
msgid ""
"**Description:** Advanced memory-aware allocation with intelligent GPU "
"reuse. Estimates model memory requirements using formula: `model_size(GB)"
" × 1024 × 1.5`, adjusts for quantization, and checks actual GPU "
"availability before allocation."
msgstr ""
"**描述:** 先进的内存感知分配,支持智能GPU复用。"
"使用公式 `模型大小(GB) × 1024 × 1.5` 估算模型内存需求,"
"根据量化级别调整,并在分配前检查实际GPU可用性。"

#: ../../source/user_guide/launch.rst:49
msgid "Packing-First Strategy"
msgstr "打包优先策略"

#: ../../source/user_guide/launch.rst:51
msgid "**Strategy Name:** `packing_first`"
msgstr "**策略名称:** `packing_first`"

#: ../../source/user_guide/launch.rst:53
msgid ""
"**Description:** Prioritizes filling GPUs with the most available memory "
"before moving to the next. Optimizes for GPU utilization by consolidating"
" instances."
msgstr ""
"**描述:** 优先填满可用内存最多的GPU,然后再使用下一个。通过整合实例来优化GPU利用率。"

#: ../../source/user_guide/launch.rst:58
msgid "Spread-First Strategy"
msgstr "分散优先策略"

#: ../../source/user_guide/launch.rst:60
msgid "**Strategy Name:** `spread_first`"
msgstr "**策略名称:** `spread_first`"

#: ../../source/user_guide/launch.rst:62
msgid ""
"**Description:** Distributes replicas across available GPUs using round-"
"robin. Promotes load balancing and even resource utilization. Still sorts"
" by available memory to prioritize better GPUs."
msgstr ""
"**描述:** 在可用GPU间使用轮询分布副本。促进负载均衡和均匀资源利用。"
"仍然按可用内存排序以优先选择更好的GPU。"

#: ../../source/user_guide/launch.rst:66
msgid "Quota-Aware Strategy"
msgstr "配额感知策略"

#: ../../source/user_guide/launch.rst:68
msgid "**Strategy Name:** `quota_aware`"
msgstr "**策略名称:** `quota_aware`"

#: ../../source/user_guide/launch.rst:70
msgid ""
"**Description:** Restricts allocation to a specified GPU quota, then "
"applies spread-first distribution within that quota. Ideal for multi-"
"tenant resource isolation and cost allocation."
msgstr ""
"**描述:** 将分配限制在指定的GPU配额内,然后在配额内应用分散优先分发。"
"适合多租户资源隔离和成本分配。"

#: ../../source/user_guide/launch.rst:74
msgid "Set Environment Variables"
msgstr "设置环境变量"

#: ../../source/user_guide/launch.rst:22
#: ../../source/user_guide/launch.rst:78
msgid ""
"Sometimes, we want to specify environment variables for a particular "
"model at runtime. Since v1.8.1, Xinference provides the capability to "
@@ -60,21 +164,21 @@ msgstr ""
"有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference "
"提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。"

#: ../../source/user_guide/launch.rst:25
#: ../../source/user_guide/launch.rst:81
msgid "For Web UI."
msgstr "针对 Web UI。"

#: ../../source/user_guide/launch.rst:31
#: ../../source/user_guide/launch.rst:87
msgid ""
"When using the command line, use ``--env`` to specify an environment "
"variable."
msgstr "命令行使用时,使用 ``--env`` 指定环境变量。"

#: ../../source/user_guide/launch.rst:33
#: ../../source/user_guide/launch.rst:89
msgid "Example usage:"
msgstr "示例用法:"

#: ../../source/user_guide/launch.rst:39
#: ../../source/user_guide/launch.rst:95
msgid ""
"Take vLLM as an example: it has versions V1 and V0, and by default, it "
"automatically determines which version to use. If you want to force the "
@@ -85,13 +189,53 @@ msgstr ""
"在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量"
"。"

#: ../../source/user_guide/launch.rst:43
#: ../../source/user_guide/launch.rst:99
msgid "Configuring Model Virtual Environment"
msgstr "配置模型虚拟空间"

#: ../../source/user_guide/launch.rst:47
#: ../../source/user_guide/launch.rst:103
msgid ""
"For this part, please refer to :ref:`toggling virtual environments and "
"customizing dependencies <model_launching_virtualenv>`."
msgstr "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 <model_launching_virtualenv>`。"
msgstr ""
"对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 <model_launching_"
"virtualenv>`。"

#~ msgid "New Feature: Smart Replica Deployment"
#~ msgstr "新功能:智能副本部署"

#~ msgid "Single GPU Multi-Replica"
#~ msgstr "单GPU多副本"

#~ msgid "New Support: Run multiple model replicas even with just one GPU."
#~ msgstr "新增支持:即使仅有一块GPU,也能运行多个模型副本。"

#~ msgid "Scenario: You have 1 GPU with sufficient VRAM"
#~ msgstr "场景:您拥有1个GPU且显存充足"

#~ msgid "Configuration: Replica Count = 3, GPU Count = 1"
#~ msgstr "配置:副本数量=3,GPU数量=1"

#~ msgid ""
#~ "Result: 3 model instances running on "
#~ "the same GPU, sharing GPU resources"
#~ msgstr "结果:3个模型实例,在同一GPU上运行,共享GPU资源"

#~ msgid "Hybrid GPU Allocation"
#~ msgstr "混合GPU分配"

#~ msgid ""
#~ "Smart Allocation: Number of replicas may"
#~ " differ from GPU count; system "
#~ "intelligently distributes"
#~ msgstr "智能分配: 副本数可以不等于GPU数量,系统会智能分配"

#~ msgid "Scenario: You have 2 GPUs and need 3 replicas"
#~ msgstr "场景: 你有2张GPU,需要3个副本"

#~ msgid "Configuration: Replicas=3, GPUs=2"
#~ msgstr "配置: 副本数=3,GPU数量=2"

#~ msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance"
#~ msgstr "结果: GPU0运行2个实例,GPU1运行1个实例"

56 changes: 56 additions & 0 deletions doc/source/user_guide/launch.rst
@@ -14,6 +14,62 @@ you can set the replica count to 2. This way, two identical instances of the mod
Xinference automatically load-balances requests to ensure even distribution across multiple GPUs.
Meanwhile, users see it as a single model, which greatly improves overall resource utilization.

Traditional Multi-Instance Deployment:

When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example:

- 2 GPUs, 2 instances: Each GPU runs one model instance
- 4 GPUs, 4 instances: Each GPU runs one model instance
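
For instance, a traditional 2-GPU, 2-instance launch might look like the
minimal sketch below. The endpoint, model name, and parameters such as
``replica`` and ``n_gpu`` are illustrative assumptions, not a verbatim API
reference:

.. code-block:: python

   from xinference.client import Client

   # Assumes a supervisor running at the default local endpoint.
   client = Client("http://localhost:9997")

   # 2 GPUs, 2 instances: each replica is expected to land on its own GPU.
   model_uid = client.launch_model(
       model_name="qwen2.5-instruct",  # hypothetical model name
       model_engine="vllm",
       replica=2,
       n_gpu=2,
   )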

.. versionadded:: v1.14.0

A new environment variable is introduced:

.. code-block:: bash

XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA

Controls whether to enable the single-GPU multi-replica feature.
Default value: ``1`` (enabled).

Key Features: Four new launch strategies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Memory-Aware Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `memory_aware` (default)

**Description:**
Advanced memory-aware allocation with intelligent GPU reuse.
Estimates model memory requirements using formula: `model_size(GB) × 1024 × 1.5`,
adjusts for quantization, and checks actual GPU availability before allocation.
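
The estimate above can be reproduced in a few lines of Python. This is a
sketch of the formula only; the quantization factors are illustrative
assumptions, not the scheduler's exact internal values:

.. code-block:: python

   def estimate_required_mib(model_size_gb: float, quantization: str = "none") -> float:
       """Rough VRAM estimate in MiB: model_size(GB) × 1024 × 1.5."""
       estimate = model_size_gb * 1024 * 1.5
       # Hypothetical adjustment: quantized weights shrink the footprint.
       factors = {"8-bit": 0.5, "4-bit": 0.25}
       estimate *= factors.get(quantization, 1.0)
       return estimate

   print(estimate_required_mib(7))  # 10752.0 MiB for a 7 GB model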

Packing-First Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `packing_first`

**Description:**
Prioritizes filling GPUs with the most available memory before moving to the next.
Optimizes for GPU utilization by consolidating instances.

Spread-First Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `spread_first`

**Description:**
Distributes replicas across available GPUs using round-robin. Promotes load balancing and even resource utilization. Still sorts by available memory to prioritize better GPUs.

Quota-Aware Strategy
^^^^^^^^^^^^^^^^^^^^^^

**Strategy Name:** `quota_aware`

**Description:**
Restricts allocation to a specified GPU quota, then applies spread-first distribution within that quota. Ideal for multi-tenant resource isolation and cost allocation.
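
To make the difference between the last three strategies concrete, the sketch
below places replicas on a list of ``(gpu_id, free_mib)`` pairs. It
illustrates the placement logic only and is not the scheduler's actual code:

.. code-block:: python

   def place(replicas, gpus, strategy, quota=None):
       """Return one GPU id per replica; gpus is [(gpu_id, free_mib), ...]."""
       # Every strategy considers GPUs sorted by free memory, best first.
       ordered = [g for g, _ in sorted(gpus, key=lambda x: -x[1])]
       if strategy == "quota_aware":
           # Restrict to the allowed quota, then spread within it.
           ordered = [g for g in ordered if quota is None or g in quota]
           strategy = "spread_first"
       if strategy == "packing_first":
           # Consolidate on the best GPU (a real implementation would stop
           # packing once the memory estimate no longer fits).
           return [ordered[0]] * replicas
       # spread_first: round-robin across the ordered GPUs.
       return [ordered[i % len(ordered)] for i in range(replicas)]

   gpus = [(0, 40_000), (1, 24_000), (2, 16_000)]
   print(place(3, gpus, "packing_first"))              # [0, 0, 0]
   print(place(3, gpus, "spread_first"))               # [0, 1, 2]
   print(place(3, gpus, "quota_aware", quota={1, 2}))  # [1, 2, 1]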

Set Environment Variables
=========================

17 changes: 14 additions & 3 deletions xinference/api/restful_api.py
@@ -1259,11 +1259,22 @@ async def launch_model(

if isinstance(gpu_idx, int):
gpu_idx = [gpu_idx]
if gpu_idx:
if len(gpu_idx) % replica:

# Check if single-GPU multi-replica is enabled
from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA

if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA:
# Allow single- or multi-GPU reuse for replicas
if gpu_idx and replica > 1:
logger.info(
f"Multi-replica deployment enabled: {replica} replicas across GPUs {gpu_idx}"
)
else:
# Traditional behavior - strict multiple requirement
if gpu_idx and len(gpu_idx) % replica:
raise HTTPException(
status_code=400,
detail="Invalid input. Allocated gpu must be a multiple of replica.",
detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.",
)

if peft_model_config is not None:
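
The effect of the flag on the endpoint above can be exercised over HTTP. A
minimal sketch, assuming a supervisor on localhost and a payload matching the
fields handled above; the model name is a placeholder:

import requests

# With XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 (the default), 3 replicas
# on 2 GPUs is accepted; with it set to 0, this request fails with HTTP 400.
resp = requests.post(
    "http://localhost:9997/v1/models",
    json={
        "model_name": "qwen2.5-instruct",  # hypothetical model name
        "model_engine": "vllm",
        "replica": 3,
        "gpu_idx": [0, 1],
    },
)
print(resp.status_code)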
15 changes: 15 additions & 0 deletions xinference/constants.py
@@ -34,6 +34,11 @@
XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS"
XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS"
XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS"
XINFERENCE_ENV_LAUNCH_STRATEGY = "XINFERENCE_LAUNCH_STRATEGY"
XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS = "XINFERENCE_LAUNCH_ALLOWED_GPUS"
XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = (
"XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"
)
XINFERENCE_ENV_BATCH_SIZE = "XINFERENCE_BATCH_SIZE"
XINFERENCE_ENV_BATCH_INTERVAL = "XINFERENCE_BATCH_INTERVAL"

@@ -114,5 +119,15 @@ def get_xinference_home() -> str:
else None
)
XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS)
XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool(
int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1"))
) # Enable by default
XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "memory_aware")
_allowed_gpu_str = os.getenv(XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS, "")
XINFERENCE_LAUNCH_ALLOWED_GPUS = (
{int(x) for x in _allowed_gpu_str.split(",") if x.strip().isdigit()}
if _allowed_gpu_str
else None
)
XINFERENCE_BATCH_SIZE = int(os.getenv(XINFERENCE_ENV_BATCH_SIZE, "32"))
XINFERENCE_BATCH_INTERVAL = float(os.getenv(XINFERENCE_ENV_BATCH_INTERVAL, "0.003"))
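
A quick way to sanity-check the new constants is to set the environment
variables before the module is imported; this sketch assumes a checkout where
xinference.constants is importable:

import os

# Hypothetical multi-tenant setup: pack replicas, restrict to GPUs 0 and 2.
os.environ["XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"] = "1"
os.environ["XINFERENCE_LAUNCH_STRATEGY"] = "packing_first"
os.environ["XINFERENCE_LAUNCH_ALLOWED_GPUS"] = "0,2"

from xinference import constants

print(constants.XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA)  # True
print(constants.XINFERENCE_LAUNCH_STRATEGY)                  # packing_first
print(constants.XINFERENCE_LAUNCH_ALLOWED_GPUS)              # {0, 2}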