From bfc7e6d685c80024dd522cab828bd7642a87f399 Mon Sep 17 00:00:00 2001 From: "ext.yanwei25" Date: Wed, 29 Oct 2025 20:24:10 +0800 Subject: [PATCH 1/3] feat: qwen3 dense support PreFetchWeight and IntraAddNorm --- xllm/core/common/global_flags.cpp | 4 ++ xllm/core/common/global_flags.h | 2 + .../npu/npu_qwen3_decoder_layer_impl.cpp | 51 +++++++++++++++++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp index 30b9b4e3..516e762f 100644 --- a/xllm/core/common/global_flags.cpp +++ b/xllm/core/common/global_flags.cpp @@ -389,3 +389,7 @@ DEFINE_string(reasoning_parser, // --- qwen3 reranker config --- DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker."); + +DEFINE_bool(enable_prefetch_weight, + false, + "Whether to enable prefetch weight."); diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h index 5c79a7c3..7fc36442 100644 --- a/xllm/core/common/global_flags.h +++ b/xllm/core/common/global_flags.h @@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker); DECLARE_string(reasoning_parser); DECLARE_bool(enable_shm); + +DECLARE_bool(enable_prefetch_weight); diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp index 0067e38a..3a219cec 100644 --- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp @@ -89,11 +89,15 @@ enum DecoderLayerTensorId : int { IN_MLP_CPROJ_SCALE = 48, // scale IN_MLP_CPROJ_COMPRESS_IDX = 49, - Q_NORM_WEIGHT = 50, - K_NORM_WEIGHT = 51, + IN_QKV_SCALE_FILL = 50, + IN_QKV_OFFSET_FILL = 51, + IN_MLP_SCALE_FILL = 52, + IN_MLP_OFFSET_FILL = 53, + Q_NORM_WEIGHT = 54, + K_NORM_WEIGHT = 55, }; -const uint64_t WEIGHT_COUNT_PER_LAYER = 52; +const uint64_t WEIGHT_COUNT_PER_LAYER = 56; static std::vector> WEIGHT_MAPPING = { {IN_NORM_WEIGHT, "input_layernorm.weight"}, @@ -207,11 +211,16 @@ void 
NpuQwen3DecoderLayerImpl::param_from_args( param.useQKNorm = true; param.numHiddenLayers = args.n_layers(); - + param.enableIntraLayerAddNorm = true; + param.enableInterLayerAddNorm = false; + param.enablePreFetchWeight = FLAGS_enable_prefetch_weight; initialize_quantization_parameters(param); if (isPrefill) { - param.enableAclnnRmsNorm = quantize_type_.empty(); + param.enableAclnnRmsNorm = + param.enableIntraLayerAddNorm && quantize_type_.empty() + ? false + : quantize_type_.empty(); // for prefix cache without chunked prefill. if (FLAGS_enable_prefix_cache && !FLAGS_enable_chunked_prefill && FLAGS_block_size != 128) { @@ -383,6 +392,38 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { at_weight_tensors_[idx] = at_placeholder_; } + if (prefill_param_.enableIntraLayerAddNorm || + prefill_param_.enableInterLayerAddNorm) { + if (quantize_type_.compare("w8a8") == 0) { + // 量化逻辑 + torch::ScalarType weight_fill_dtype = torch::kBFloat16; + int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1); + int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1); + at_weight_tensors_[IN_QKV_SCALE_FILL] = at_weight_tensors_[IN_Q_SCALE] + .repeat(weight_attn_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_MLP_SCALE_FILL] = + at_weight_tensors_[IN_MLP_W2_SCALE] + .repeat(weight_mlp_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_QKV_OFFSET_FILL] = at_weight_tensors_[IN_Q_OFFSET] + .repeat(weight_attn_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_MLP_OFFSET_FILL] = + at_weight_tensors_[IN_MLP_W2_OFFSET] + .repeat(weight_mlp_shape) + .to(weight_fill_dtype); + } else { + // 浮点逻辑 + for (auto idx : {IN_QKV_SCALE_FILL, + IN_QKV_OFFSET_FILL, + IN_MLP_SCALE_FILL, + IN_MLP_OFFSET_FILL}) { + at_weight_tensors_[idx] = at_placeholder_; + } + } + } + c10_npu::NPUCachingAllocator::emptyCache(); for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { atb_weight_tensors_[i] = From 3416a4511b750f43105075a27900fcef85fc5b1d Mon Sep 17 
00:00:00 2001 From: Panxuanyu Date: Thu, 30 Oct 2025 22:50:11 +0800 Subject: [PATCH 2/3] bugfix: modify comment in english --- xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp index 3a219cec..d48379fb 100644 --- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp @@ -395,7 +395,7 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { if (prefill_param_.enableIntraLayerAddNorm || prefill_param_.enableInterLayerAddNorm) { if (quantize_type_.compare("w8a8") == 0) { - // 量化逻辑 + // quantize torch::ScalarType weight_fill_dtype = torch::kBFloat16; int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1); int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1); @@ -414,7 +414,7 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { .repeat(weight_mlp_shape) .to(weight_fill_dtype); } else { - // 浮点逻辑 + // bfloat16 or float16 for (auto idx : {IN_QKV_SCALE_FILL, IN_QKV_OFFSET_FILL, IN_MLP_SCALE_FILL, From b6cd9cc2de2015c0bee6d00dd1994ba199166640 Mon Sep 17 00:00:00 2001 From: Panxuanyu Date: Tue, 4 Nov 2025 14:56:14 +0800 Subject: [PATCH 3/3] bugfix: modify parameter description for enable_prefetch_weight --- xllm/core/common/global_flags.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp index 516e762f..4fb98204 100644 --- a/xllm/core/common/global_flags.cpp +++ b/xllm/core/common/global_flags.cpp @@ -390,6 +390,9 @@ DEFINE_string(reasoning_parser, // --- qwen3 reranker config --- DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker."); -DEFINE_bool(enable_prefetch_weight, - false, - "Whether to enable prefetch weight."); +DEFINE_bool( + enable_prefetch_weight, + false, + 
"Whether to enable prefetch weight, only applicable to Qwen3-dense models. " + "The default prefetching ratio for the gateup weight is 40%. " + "To adjust it, set the environment variable, e.g. export PREFETCH_COEFFICIENT=0.5.");