Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions vllm_ascend/ascend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class AscendConfig:

def __init__(self, vllm_config):
additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
self.mix_placement = additional_config.get("mix_placement",False)
torchair_graph_config = additional_config.get("torchair_graph_config",
{})
self.torchair_graph_config = TorchairGraphConfig(
Expand Down
22 changes: 18 additions & 4 deletions vllm_ascend/eplb/adaptor/vllm_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,20 +194,34 @@ def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str):
json.dump(record, f, indent=4)

def do_update_expert_map(self, layer_id, updated_expert_map):
self.expert_map_per_layer[layer_id] = updated_expert_map.clone()
self.expert_map_per_layer_cpu[layer_id] = updated_expert_map.clone()
pad_len = self.expert_map_per_layer[layer_id].shape[0] - updated_expert_map.shape[0]
updated_expert_map_padded = torch.nn.functional.pad(
updated_expert_map,
pad=(0,pad_len),
mode='constant',
value=-1
)
self.expert_map_per_layer[layer_id].copy_(updated_expert_map_padded)
self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The in-place copy self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) will raise a RuntimeError if updated_expert_map has a different shape than self.expert_map_per_layer_cpu[layer_id]. The logic for padding updated_expert_map for the device tensor self.expert_map_per_layer suggests that shape mismatches are expected. The CPU-side map should be handled in a way that accommodates shape changes to avoid crashes. Reassigning the tensor, as was done previously, is a safer approach.

Suggested change
self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)
self.expert_map_per_layer_cpu[layer_id] = updated_expert_map.clone()


def do_update_expert_weight(self, layer_id, local_expert_to_replace,
buffer_tensor_id):
for expert_tensor, buffer_tensor in zip(
self.expert_param_per_layer[layer_id][local_expert_to_replace],
self.buffer_tensor_list[buffer_tensor_id]):
expert_tensor = buffer_tensor.clone()
expert_tensor.copy_(buffer_tensor)
logger.debug(f"Expert tensor shape is :{expert_tensor.shape}")

def do_update_log2phy_map(self, layer_id, updated_log2phy_map):
if self.log2phy_map_per_layer[layer_id] is not None:
self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map)
pad_len = self.log2phy_map_per_layer[layer_id].shape[0] - updated_log2phy_map.shape[0]
updated_log2phy_map_padded = torch.nn.functional.pad(
updated_log2phy_map,
pad=(0,pad_len),
mode='constant',
value=-1
)
self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map_padded)

def global2local(self, placement: torch.Tensor,
E_local: int) -> torch.Tensor:
Expand Down
4 changes: 0 additions & 4 deletions vllm_ascend/eplb/core/eplb_device_transfer_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,6 @@ def generate_expert_d2d_transfer_task(self, expert_send_info,
)
return

# If neither send nor receive task is needed for this layer on this rank, return
if not (expert_send_info or expert_recv_info):
return

self.updated_expert_map = updated_expert_map

self.layer_id = layer_id
Expand Down
15 changes: 15 additions & 0 deletions vllm_ascend/ops/fused_moe/experts_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def select_experts(hidden_states: torch.Tensor,
routed_scaling_factor=1.0,
e_score_correction_bias: Optional[torch.Tensor] = None,
indices_type: Optional[torch.dtype] = None,
mix_placement: Optional[bool] = False,
num_logical_experts: int = -1,
global_num_experts: int = -1):
"""
Fused experts with select experts.
Expand Down Expand Up @@ -87,6 +89,19 @@ def select_experts(hidden_states: torch.Tensor,
e_score_correction_bias=e_score_correction_bias,
global_num_experts=global_num_experts,
)
if mix_placement:
pad_shared_expert_ids = torch.full((topk_ids.shape[0], 1),
num_logical_experts,
dtype=topk_ids.dtype,
device=topk_ids.device)

pad_shared_expert_weights = torch.full((topk_weights.shape[0], 1),
0.4,
dtype=topk_weights.dtype,
device=topk_weights.device)
Comment on lines +98 to +101
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The value 0.4 is used as a hardcoded weight for the padded shared expert. This "magic number" makes the code harder to understand and maintain. It's unclear why this specific value is chosen and what its implications are, especially concerning weight normalization which typically expects weights to sum to 1. This value should be defined as a named constant with an explanatory comment, or passed as a parameter to the function to improve clarity and maintainability.

topk_ids = torch.cat([topk_ids, pad_shared_expert_ids], dim=1)
topk_weights = torch.cat([topk_weights, pad_shared_expert_weights],
dim=1)
return topk_weights, topk_ids


Expand Down
46 changes: 30 additions & 16 deletions vllm_ascend/ops/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,10 @@ def __init__(self, *args, **kwargs):
self.moe_config.dp_group = get_dp_group()
self.moe_config.ep_group = get_ep_group()
self.moe_config.mc2_group = get_mc2_group()
ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.expert_map_path = ascend_config.expert_map_path
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
self.ascend_config = get_ascend_config()
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
self.expert_map_path = self.ascend_config.expert_map_path
self.global_redundant_expert_num = self.ascend_config.init_redundancy_expert
self.global_num_experts = num_experts + self.global_redundant_expert_num
if self.custom_routing_function is None and self.e_score_correction_bias is not None:
vllm_config = get_current_vllm_config()
Expand Down Expand Up @@ -248,7 +248,7 @@ def __init__(self, *args, **kwargs):
moe_quant_params["intermediate_size_full"] = intermediate_size
self.quant_method.create_weights(layer=self, **moe_quant_params)

self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp

setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type()
Expand All @@ -275,7 +275,7 @@ def get_map(self):
return self.expert_map

def get_log2phy_map(self):
return self.logical_to_physical_map
return self.log2phy

def clear_moe_load(self):
if self.moe_load is not None:
Expand Down Expand Up @@ -428,8 +428,8 @@ def __init__(
self._shared_experts = shared_experts
self.use_overlapped = use_overlapped
self.shared_expert_stream = None
ascend_config = get_ascend_config()
self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
self.ascend_config = get_ascend_config()
self.multistream_overlap_shared_expert = self.ascend_config.multistream_overlap_shared_expert
if enable_sp():
logger.info_once(
"Sequence parallelism is enabled, shared experts are replicated for best performance."
Expand Down Expand Up @@ -457,11 +457,19 @@ def forward(
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
shared_out, fused_out = AscendFusedMoE.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
if self._shared_experts is None:
fused_out = AscendFusedMoE.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
shared_out = None
else:
shared_out, fused_out = AscendFusedMoE.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
return shared_out, fused_out

def forward_impl(self, hidden_states: torch.Tensor,
Expand All @@ -475,7 +483,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
# Use a separate stream to run shared experts.
# Note that currently we only support calculations in separate streams with aclgraph.
# Communication operations in another stream might cause unknown errors.
shared_out = self._shared_experts(hidden_states)
if self._shared_experts is None:
shared_out = None
else:
shared_out = self._shared_experts(hidden_states)

fused_output = AscendFusedMoE.forward_impl(
self,
Expand All @@ -490,6 +501,9 @@ def forward_impl(self, hidden_states: torch.Tensor,
forward_context = get_forward_context()
moe_comm_type = forward_context.moe_comm_type
if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2} \
and not shared_expert_dp_enabled():
and not shared_expert_dp_enabled() and shared_out is not None:
shared_out = tensor_model_parallel_all_reduce(shared_out)
return shared_out, fused_output
if shared_out is None:
return fused_output
else:
return shared_out, fused_output
4 changes: 3 additions & 1 deletion vllm_ascend/ops/fused_moe/moe_mlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,16 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
if quantized_hidden_states is not None:
dispose_tensor(quantized_hidden_states)
# act_fn: swiglu
group_diff = torch.diff(group_list)
new_group = torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0)
Comment on lines +115 to +116
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The calculation of new_group from group_list (which appears to be a cumulative sum of group sizes) is incorrect and can lead to runtime errors or incorrect behavior.

  1. If group_list contains only one element, torch.diff(group_list) will be empty, and group_diff[0] will raise an IndexError.
  2. If group_list is [g1, g1+g2, ...], group_diff will be [g2, g3, ...]. The current logic torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0) would produce [g2, g2, g3, ...], which is incorrect as the first group size should be g1.

The correct way to get group sizes from a cumulative sum tensor is to use torch.diff with the first element of group_list prepended.

Suggested change
group_diff = torch.diff(group_list)
new_group = torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0)
new_group = torch.cat((group_list[0:1], torch.diff(group_list)))

hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
x=hidden_states,
weight_scale=w1_scale,
activation_scale=pertoken_scale,
bias=None,
quant_scale=None,
quant_offset=None,
group_index=group_list,
group_index=new_group,
activate_left=True,
quant_mode=1,
)
Expand Down
1 change: 1 addition & 0 deletions vllm_ascend/patch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,4 @@
# Future Plan:
# Remove this patch when adapted vllm version contains the above PR.
#
from vllm_ascend.patch.worker import patch_deepseekv3

Check failure on line 141 in vllm_ascend/patch/__init__.py

View workflow job for this annotation

GitHub Actions / lint / pre-commit

Ruff (F401)

vllm_ascend/patch/__init__.py:141:38: F401 `vllm_ascend.patch.worker.patch_deepseekv3` imported but unused; consider removing, adding to `__all__`, or using a redundant alias
Loading
Loading