Skip to content

Commit 1533349

Browse files
committed
add layerwise CI for Qwen3-235B-w8a8 and DEEPSEEK
Signed-off-by: Fager10086 <[email protected]>
1 parent d252e36 commit 1533349

File tree

3 files changed

+205
-0
lines changed

3 files changed

+205
-0
lines changed

.github/workflows/vllm_ascend_test_nightly_a3.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ jobs:
5252
- name: multi-node-deepseek-pd
5353
config_file_path: DeepSeek-V3.yaml
5454
size: 2
55+
- name: multi-node-deepseek-pd
56+
config_file_path: DeepSeek-V3-layerwise.yaml
57+
size: 2
5558
- name: multi-node-qwen3-dp
5659
config_file_path: Qwen3-235B-A3B.yaml
5760
size: 2
@@ -61,6 +64,9 @@ jobs:
6164
- name: multi-node-qwenw8a8-2node
6265
config_file_path: Qwen3-235B-W8A8.yaml
6366
size: 2
67+
- name: multi-node-qwenw8a8-2node
68+
config_file_path: Qwen3-235B-W8A8-layerwise.yaml
69+
size: 2
6470
- name: multi-node-glm-2node
6571
config_file_path: GLM-4_5.yaml
6672
size: 2
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# For disaggregated mode, set `enabled: true` under `disaggregated_prefill`, and set the following parameters:
2+
# prefiller_host_index: the host indices of the nodes running prefillers
3+
# decoder_host_index: the host indices of the nodes running decoders
4+
# Suppose we have **4 nodes** running a 2P1D setup (2 Prefillers + 1 Decoder):
5+
# ┌───────────────┬───────────────┬───────────────┬───────────────┐
6+
# │ node0 │ node1 │ node2 │ node3 │
7+
# │ Prefiller #1 │ Prefiller #2 │ Decoder │ Decoder │
8+
# └───────────────┴───────────────┴───────────────┴───────────────┘
9+
# For the prefiller nodes, the hosts should be node0 and node1.
10+
# For the decoder nodes, we only have 1 decoder node (dp+tp+ep across node2 and node3, where node3 is running in headless mode).
11+
# So the prefiller_host_index is [0, 1], and the decoder_host_index is [2]
12+
test_name: "test DeepSeek-V3 disaggregated_prefill"
13+
model: "vllm-ascend/DeepSeek-V3-W8A8"
14+
num_nodes: 2
15+
npu_per_node: 16
16+
env_common:
17+
VLLM_USE_MODELSCOPE: true
18+
OMP_PROC_BIND: false
19+
OMP_NUM_THREADS: 100
20+
HCCL_BUFFSIZE: 1024
21+
SERVER_PORT: 8080
22+
NUMEXPR_MAX_THREADS: 128
23+
DISAGGREGATED_PREFILL_PROXY_SCRIPT: "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
24+
# For non-Kubernetes deployments, list the IPs of all nodes used, in order, as follows:
25+
# cluster_hosts: []
26+
disaggregated_prefill:
27+
enabled: true
28+
prefiller_host_index: [0]
29+
decoder_host_index: [1]
30+
31+
deployment:
32+
-
33+
server_cmd: >
34+
vllm serve "vllm-ascend/DeepSeek-V3-W8A8"
35+
--host 0.0.0.0
36+
--port $SERVER_PORT
37+
--data-parallel-size 2
38+
--data-parallel-size-local 2
39+
--tensor-parallel-size 8
40+
--seed 1024
41+
--enforce-eager
42+
--enable-expert-parallel
43+
--max-num-seqs 16
44+
--max-model-len 8192
45+
--max-num-batched-tokens 8192
46+
--quantization ascend
47+
--trust-remote-code
48+
--no-enable-prefix-caching
49+
--gpu-memory-utilization 0.9
50+
--kv-transfer-config
51+
'{"kv_connector": "MooncakeLayerwiseConnector",
52+
"kv_role": "kv_producer",
53+
"kv_port": "30000",
54+
"engine_id": "0",
55+
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
56+
"kv_connector_extra_config": {
57+
"prefill": {
58+
"dp_size": 2,
59+
"tp_size": 8
60+
},
61+
"decode": {
62+
"dp_size": 2,
63+
"tp_size": 8
64+
}
65+
}
66+
}'
67+
68+
-
69+
server_cmd: >
70+
vllm serve "vllm-ascend/DeepSeek-V3-W8A8"
71+
--host 0.0.0.0
72+
--port $SERVER_PORT
73+
--data-parallel-size 2
74+
--data-parallel-size-local 2
75+
--tensor-parallel-size 8
76+
--seed 1024
77+
--quantization ascend
78+
--max-num-seqs 16
79+
--max-model-len 8192
80+
--max-num-batched-tokens 8192
81+
--enable-expert-parallel
82+
--trust-remote-code
83+
--no-enable-prefix-caching
84+
--gpu-memory-utilization 0.9
85+
--additional-config '{"torchair_graph_config":{"enabled":true}}'
86+
--kv-transfer-config
87+
'{"kv_connector": "MooncakeLayerwiseConnector",
88+
"kv_role": "kv_consumer",
89+
"kv_port": "30200",
90+
"engine_id": "1",
91+
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
92+
"kv_connector_extra_config": {
93+
"prefill": {
94+
"dp_size": 2,
95+
"tp_size": 8
96+
},
97+
"decode": {
98+
"dp_size": 2,
99+
"tp_size": 8
100+
}
101+
}
102+
}'
103+
benchmarks:
104+
acc:
105+
case_type: accuracy
106+
dataset_path: vllm-ascend/gsm8k-lite
107+
request_conf: vllm_api_general_chat
108+
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
109+
max_out_len: 4096
110+
batch_size: 512
111+
baseline: 95
112+
threshold: 5
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
test_name: "test Qwen3-235B-A22B-W8A8 disaggregated_prefill"
2+
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
3+
num_nodes: 2
4+
npu_per_node: 16
5+
env_common:
6+
VLLM_USE_MODELSCOPE: true
7+
OMP_PROC_BIND: false
8+
OMP_NUM_THREADS: 100
9+
HCCL_BUFFSIZE: 1024
10+
SERVER_PORT: 8080
11+
NUMEXPR_MAX_THREADS: 128
12+
disaggregated_prefill:
13+
enabled: true
14+
prefiller_host_index: [0]
15+
decoder_host_index: [1]
16+
17+
deployment:
18+
-
19+
server_cmd: >
20+
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
21+
--host 0.0.0.0
22+
--port $SERVER_PORT
23+
--data-parallel-size 2
24+
--data-parallel-size-local 2
25+
--tensor-parallel-size 8
26+
--seed 1024
27+
--enable-expert-parallel
28+
--max-num-seqs 16
29+
--max-model-len 8192
30+
--max-num-batched-tokens 8192
31+
--quantization ascend
32+
--trust-remote-code
33+
--no-enable-prefix-caching
34+
--gpu-memory-utilization 0.9
35+
--kv-transfer-config
36+
'{"kv_connector": "MooncakeLayerwiseConnector",
37+
"kv_role": "kv_producer",
38+
"kv_port": "30000",
39+
"engine_id": "0",
40+
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
41+
"kv_connector_extra_config": {
42+
"prefill": {
43+
"dp_size": 2,
44+
"tp_size": 8
45+
},
46+
"decode": {
47+
"dp_size": 2,
48+
"tp_size": 8
49+
}
50+
}
51+
}'
52+
53+
-
54+
server_cmd: >
55+
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
56+
--host 0.0.0.0
57+
--port $SERVER_PORT
58+
--data-parallel-size 2
59+
--data-parallel-size-local 2
60+
--tensor-parallel-size 8
61+
--seed 1024
62+
--quantization ascend
63+
--max-num-seqs 16
64+
--max-model-len 8192
65+
--max-num-batched-tokens 8192
66+
--enable-expert-parallel
67+
--trust-remote-code
68+
--no-enable-prefix-caching
69+
--gpu-memory-utilization 0.9
70+
--kv-transfer-config
71+
'{"kv_connector": "MooncakeLayerwiseConnector",
72+
"kv_role": "kv_consumer",
73+
"kv_port": "30200",
74+
"engine_id": "1",
75+
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
76+
"kv_connector_extra_config": {
77+
"prefill": {
78+
"dp_size": 2,
79+
"tp_size": 8
80+
},
81+
"decode": {
82+
"dp_size": 2,
83+
"tp_size": 8
84+
}
85+
}
86+
}'
87+
benchmarks:

0 commit comments

Comments
 (0)