diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
index 30b9b4e3..4fb98204 100644
--- a/xllm/core/common/global_flags.cpp
+++ b/xllm/core/common/global_flags.cpp
@@ -389,3 +389,10 @@ DEFINE_string(reasoning_parser,
 // --- qwen3 reranker config ---
 
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(
+    enable_prefetch_weight,
+    false,
+    "Whether to enable weight prefetching; only applicable to the Qwen3 "
+    "dense model. The default prefetch ratio for the gate-up weight is 40%; "
+    "to adjust it, set e.g. export PREFETCH_COEFFOCIENT=0.5.");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
index 5c79a7c3..7fc36442 100644
--- a/xllm/core/common/global_flags.h
+++ b/xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_prefetch_weight);
diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
index 0067e38a..d48379fb 100644
--- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
+++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
@@ -89,11 +89,15 @@ enum DecoderLayerTensorId : int {
   IN_MLP_CPROJ_SCALE = 48,  // scale
   IN_MLP_CPROJ_COMPRESS_IDX = 49,
 
-  Q_NORM_WEIGHT = 50,
-  K_NORM_WEIGHT = 51,
+  IN_QKV_SCALE_FILL = 50,
+  IN_QKV_OFFSET_FILL = 51,
+  IN_MLP_SCALE_FILL = 52,
+  IN_MLP_OFFSET_FILL = 53,
+  Q_NORM_WEIGHT = 54,
+  K_NORM_WEIGHT = 55,
 };
 
-const uint64_t WEIGHT_COUNT_PER_LAYER = 52;
+const uint64_t WEIGHT_COUNT_PER_LAYER = 56;
 
 static std::vector<std::pair<int, std::string>> WEIGHT_MAPPING = {
     {IN_NORM_WEIGHT, "input_layernorm.weight"},
@@ -207,11 +211,16 @@ void NpuQwen3DecoderLayerImpl::param_from_args(
   param.useQKNorm = true;
   param.numHiddenLayers = args.n_layers();
-
+  param.enableIntraLayerAddNorm = true;
+  param.enableInterLayerAddNorm = false;
+  param.enablePreFetchWeight = FLAGS_enable_prefetch_weight;
   initialize_quantization_parameters(param);
 
   if (isPrefill) {
-    param.enableAclnnRmsNorm = quantize_type_.empty();
+    param.enableAclnnRmsNorm =
+        param.enableIntraLayerAddNorm && quantize_type_.empty()
+            ? false
+            : quantize_type_.empty();
     // for prefix cache without chunked prefill.
     if (FLAGS_enable_prefix_cache && !FLAGS_enable_chunked_prefill &&
         FLAGS_block_size != 128) {
@@ -383,6 +392,38 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() {
     at_weight_tensors_[idx] = at_placeholder_;
   }
 
+  if (prefill_param_.enableIntraLayerAddNorm ||
+      prefill_param_.enableInterLayerAddNorm) {
+    if (quantize_type_.compare("w8a8") == 0) {
+      // quantize
+      torch::ScalarType weight_fill_dtype = torch::kBFloat16;
+      int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1);
+      int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1);
+      at_weight_tensors_[IN_QKV_SCALE_FILL] = at_weight_tensors_[IN_Q_SCALE]
+                                                  .repeat(weight_attn_shape)
+                                                  .to(weight_fill_dtype);
+      at_weight_tensors_[IN_MLP_SCALE_FILL] =
+          at_weight_tensors_[IN_MLP_W2_SCALE]
+              .repeat(weight_mlp_shape)
+              .to(weight_fill_dtype);
+      at_weight_tensors_[IN_QKV_OFFSET_FILL] = at_weight_tensors_[IN_Q_OFFSET]
+                                                   .repeat(weight_attn_shape)
+                                                   .to(weight_fill_dtype);
+      at_weight_tensors_[IN_MLP_OFFSET_FILL] =
+          at_weight_tensors_[IN_MLP_W2_OFFSET]
+              .repeat(weight_mlp_shape)
+              .to(weight_fill_dtype);
+    } else {
+      // bfloat16 or float16
+      for (auto idx : {IN_QKV_SCALE_FILL,
+                       IN_QKV_OFFSET_FILL,
+                       IN_MLP_SCALE_FILL,
+                       IN_MLP_OFFSET_FILL}) {
+        at_weight_tensors_[idx] = at_placeholder_;
+      }
+    }
+  }
+
   c10_npu::NPUCachingAllocator::emptyCache();
   for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) {
     atb_weight_tensors_[i] =
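Note on the new *_FILL slots in merge_loaded_weights(): for w8a8, the per-tensor quantization scale/offset (e.g. IN_Q_SCALE) is repeated across the matching weight's last dimension and cast to bf16, presumably so the fused AddNorm path sees full-width tensors; in the bf16/fp16 branch there are no quant parameters, so the four slots are pointed at at_placeholder_ to keep the 56-tensor layout. A minimal standalone libtorch sketch of that expansion (q_scale, q_offset, weight_attn_shape and their values are made-up stand-ins, not the layer's real members):

#include <torch/torch.h>
#include <iostream>

int main() {
  // Stand-ins for the loaded per-tensor quant parameters
  // (at_weight_tensors_[IN_Q_SCALE] / [IN_Q_OFFSET] in the patch).
  torch::Tensor q_scale = torch::tensor({0.02f});
  torch::Tensor q_offset = torch::tensor({-3.0f});

  // Stand-in for at_weight_tensors_[IN_Q_WEIGHT].size(-1).
  int64_t weight_attn_shape = 4096;

  // Same expansion as IN_QKV_SCALE_FILL / IN_QKV_OFFSET_FILL above:
  // repeat along the last dim, then cast to bf16.
  torch::Tensor qkv_scale_fill =
      q_scale.repeat(weight_attn_shape).to(torch::kBFloat16);
  torch::Tensor qkv_offset_fill =
      q_offset.repeat(weight_attn_shape).to(torch::kBFloat16);

  std::cout << qkv_scale_fill.sizes() << std::endl;   // [4096]
  std::cout << qkv_offset_fill.sizes() << std::endl;  // [4096]
  return 0;
}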