diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md
index e92cb663c4f..0477c274551 100644
--- a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md
+++ b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md
@@ -102,239 +102,6 @@ make install
 
 We can run the following scripts to launch a server on the prefiller/decoder node, respectively. Please note that each P/D node will occupy ports ranging from kv_port to kv_port + num_chips to initialize socket listeners. To avoid any issues, port conflicts should be prevented. Additionally, ensure that each node's engine_id is uniquely assigned to avoid conflicts.
 
-### Layerwise
-
-:::::{tab-set}
-
-::::{tab-item} Prefiller node 1
-
-```shell
-unset ftp_proxy
-unset https_proxy
-unset http_proxy
-export HCCL_IF_IP=192.0.0.1
-export GLOO_SOCKET_IFNAME="eth0" # network card name
-export TP_SOCKET_IFNAME="eth0"
-export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
-export HCCL_BUFFSIZE=1024
-export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
-export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
-vllm serve /model/Qwen3-235B-A22B-W8A8 \
-  --host 0.0.0.0 \
-  --port 8004 \
-  --api-server-count 1 \
-  --data-parallel-size 2 \
-  --data-parallel-size-local 2 \
-  --data-parallel-address 192.0.0.1 \
-  --data-parallel-rpc-port 13389 \
-  --tensor-parallel-size 8 \
-  --enable-expert-parallel \
-  --seed 1024 \
-  --enforce-eager \
-  --distributed-executor-backend mp \
-  --served-model-name qwen3-moe \
-  --max-model-len 32768 \
-  --max-num-batched-tokens 32768 \
-  --trust-remote-code \
-  --gpu-memory-utilization 0.9 \
-  --kv-transfer-config \
-  '{"kv_connector": "MooncakeLayerwiseConnector",
-    "kv_role": "kv_producer",
-    "kv_port": "30000",
-    "engine_id": "0",
-    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
-    "kv_connector_extra_config": {
-      "prefill": {
-        "dp_size": 2,
-        "tp_size": 8
-      },
-      "decode": {
-        "dp_size": 32,
-        "tp_size": 1
-      }
-    }
-  }'
-```
-
-::::
-
-::::{tab-item} Prefiller node 2
-
-```shell
-unset ftp_proxy
-unset https_proxy
-unset http_proxy
-export HCCL_IF_IP=192.0.0.2
-export GLOO_SOCKET_IFNAME="eth0" # network card name
-export TP_SOCKET_IFNAME="eth0"
-export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
-export HCCL_BUFFSIZE=1024
-export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
-export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
-vllm serve /model/Qwen3-235B-A22B-W8A8 \
-  --host 0.0.0.0 \
-  --port 8004 \
-  --api-server-count 1 \
-  --data-parallel-size 2 \
-  --data-parallel-size-local 2 \
-  --data-parallel-address 192.0.0.2 \
-  --data-parallel-rpc-port 13389 \
-  --tensor-parallel-size 8 \
-  --enable-expert-parallel \
-  --seed 1024 \
-  --enforce-eager \
-  --distributed-executor-backend mp \
-  --served-model-name qwen3-moe \
-  --max-model-len 32768 \
-  --max-num-batched-tokens 32768 \
-  --trust-remote-code \
-  --gpu-memory-utilization 0.9 \
-  --kv-transfer-config \
-  '{"kv_connector": "MooncakeLayerwiseConnector",
-    "kv_role": "kv_producer",
-    "kv_port": "30100",
-    "engine_id": "1",
-    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
-    "kv_connector_extra_config": {
-      "prefill": {
-        "dp_size": 2,
-        "tp_size": 8
-      },
-      "decode": {
-        "dp_size": 32,
-        "tp_size": 1
-      }
-    }
-  }'
-```
-
-::::
-
-::::{tab-item} Decoder node 1 (master node)
-
-```shell
-unset ftp_proxy
-unset https_proxy
-unset http_proxy
-export HCCL_IF_IP=192.0.0.3
-export GLOO_SOCKET_IFNAME="eth0" # network card name
-export TP_SOCKET_IFNAME="eth0"
-export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
-export HCCL_BUFFSIZE=2048
-export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
-export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
-vllm serve /model/Qwen3-235B-A22B-W8A8 \
-  --host 0.0.0.0 \
-  --port 8004 \
-  --api-server-count 1 \
-  --data-parallel-size 32 \
-  --data-parallel-size-local 16 \
-  --data-parallel-address 192.0.0.3 \
-  --data-parallel-rpc-port 5964 \
-  --tensor-parallel-size 1 \
-  --enable-expert-parallel \
-  --seed 1024 \
-  --distributed-executor-backend mp \
-  --served-model-name qwen3-moe \
-  --max-model-len 32768 \
-  --max-num-batched-tokens 512 \
-  --max-num_seqs 16 \
-  --trust-remote-code \
-  --no-enable-prefix-caching \
-  --gpu-memory-utilization 0.9 \
-  --compilation-config '{"cudagraph_capture_sizes":[16]}' \
-  --kv-transfer-config \
-  '{"kv_connector": "MooncakeLayerwiseConnector",
-    "kv_role": "kv_consumer",
-    "kv_port": "30200",
-    "engine_id": "2",
-    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
-    "kv_connector_extra_config": {
-      "prefill": {
-        "dp_size": 2,
-        "tp_size": 8
-      },
-      "decode": {
-        "dp_size": 32,
-        "tp_size": 1
-      }
-    }
-  }'
-```
-
-::::
-
-::::{tab-item} Decoder node 2 (primary node)
-
-```shell
-unset ftp_proxy
-unset https_proxy
-unset http_proxy
-export HCCL_IF_IP=192.0.0.4
-export GLOO_SOCKET_IFNAME="eth0" # network card name
-export TP_SOCKET_IFNAME="eth0"
-export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
-export HCCL_BUFFSIZE=2048
-export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
-export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
-vllm serve /model/Qwen3-235B-A22B-W8A8 \
-  --host 0.0.0.0 \
-  --port 8004 \
-  --headless \
-  --data-parallel-size 32 \
-  --data-parallel-size-local 16 \
-  --data-parallel-start-rank 16 \
-  --data-parallel-address 192.0.0.3 \
-  --data-parallel-rpc-port 5964 \
-  --tensor-parallel-size 1 \
-  --enable-expert-parallel \
-  --seed 1024 \
-  --distributed-executor-backend mp \
-  --served-model-name qwen3-moe \
-  --max-model-len 32768 \
-  --max-num-batched-tokens 512 \
-  --max-num_seqs 16 \
-  --trust-remote-code \
-  --no-enable-prefix-caching \
-  --gpu-memory-utilization 0.9 \
-  --compilation-config '{"cudagraph_capture_sizes":[16]}' \
-  --kv-transfer-config \
-  '{"kv_connector": "MooncakeLayerwiseConnector",
-    "kv_role": "kv_consumer",
-    "kv_port": "30200",
-    "engine_id": "2",
-    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
-    "kv_connector_extra_config": {
-      "prefill": {
-        "dp_size": 2,
-        "tp_size": 8
-      },
-      "decode": {
-        "dp_size": 32,
-        "tp_size": 1
-      }
-    }
-  }'
-```
-
-::::
-
-:::::
-
-### Non-layerwise
-
 :::::{tab-set}
 
 ::::{tab-item} Prefiller node 1
@@ -566,25 +333,7 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
 
 ## Example Proxy for Deployment
 
-Run a proxy server on the same node with the prefiller service instance. You can get the proxy program in the repository's examples: [load\_balance\_proxy\_layerwise\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py) or [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
-
-:::::{tab-set}
-
-::::{tab-item} Layerwise
-
-```shell
-python load_balance_proxy_layerwise_server_example.py \
-  --host 192.0.0.1 \
-  --port 8080 \
-  --prefiller-hosts 192.0.0.1 192.0.0.2\
-  --prefiller-port 8004 8004\
-  --decoder-hosts 192.0.0.3\
-  --decoder-ports 8004
-```
-
-::::
-
-::::{tab-item} Non-layerwise
+Run a proxy server on the same node with the prefiller service instance. You can get the proxy program in the repository's examples: [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
 
 ```shell
 python load_balance_proxy_server_example.py \
   --host 192.0.0.1 \
   --port 8080 \
   --prefiller-hosts 192.0.0.1 192.0.0.2\
   --prefiller-port 8004 8004\
   --decoder-hosts 192.0.0.3\
   --decoder-ports 8004
 ```
 
-::::
-
-:::::
-
 ## Verification
 
 Check service health using the proxy server endpoint.
diff --git a/vllm_ascend/distributed/__init__.py b/vllm_ascend/distributed/__init__.py
index 0915b38a519..697f1c14615 100644
--- a/vllm_ascend/distributed/__init__.py
+++ b/vllm_ascend/distributed/__init__.py
@@ -33,8 +33,3 @@ def register_connector():
         "MooncakeConnectorStoreV1",
         "vllm_ascend.distributed.mooncake.mooncake_store_connector_v1",
         "MooncakeConnectorV1")
-
-    KVConnectorFactory.register_connector(
-        "MooncakeLayerwiseConnector",
-        "vllm_ascend.distributed.mooncake_layerwise_connector",
-        "MooncakeLayerwiseConnector")
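
The last hunk drops the in-tree registration of `MooncakeLayerwiseConnector`, so only the connectors that remain registered in `vllm_ascend/distributed/__init__.py` can be named in `--kv-transfer-config`. For readers who maintain their own connector, the sketch below shows how the same `register_connector(name, module_path, class_name)` call pattern could be reused to register an out-of-tree class at process startup. It is a minimal, hypothetical example: the `KVConnectorFactory` import path is assumed from how vLLM exposes its factory, and `my_pd_plugins.layerwise_connector` / `MyLayerwiseConnector` are placeholder names, not part of vllm-ascend.

```python
# Hypothetical out-of-tree registration, mirroring the
# KVConnectorFactory.register_connector(name, module_path, class_name)
# calls that remain in vllm_ascend/distributed/__init__.py.
# Assumption: vLLM's factory is importable from this path; "my_pd_plugins"
# and "MyLayerwiseConnector" are made-up placeholder names.
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory


def register_custom_connectors() -> None:
    # The first argument is the name referenced by the "kv_connector" field
    # of --kv-transfer-config; the last two tell the factory which module to
    # import and which class to load when that name is requested.
    KVConnectorFactory.register_connector(
        "MyLayerwiseConnector",
        "my_pd_plugins.layerwise_connector",
        "MyLayerwiseConnector")


if __name__ == "__main__":
    register_custom_connectors()
```

With such a registration in place, the `"kv_connector"` field of the `--kv-transfer-config` JSON would carry the custom name, in the same way the removed tutorial commands passed `"MooncakeLayerwiseConnector"`.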