From bfc7e6d685c80024dd522cab828bd7642a87f399 Mon Sep 17 00:00:00 2001 From: "ext.yanwei25" Date: Wed, 29 Oct 2025 20:24:10 +0800 Subject: [PATCH 1/3] feat: qwen3 dense support PreFetchWeight and IntraAddNorm --- xllm/core/common/global_flags.cpp | 4 ++ xllm/core/common/global_flags.h | 2 + .../npu/npu_qwen3_decoder_layer_impl.cpp | 51 +++++++++++++++++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp index 30b9b4e3..516e762f 100644 --- a/xllm/core/common/global_flags.cpp +++ b/xllm/core/common/global_flags.cpp @@ -389,3 +389,7 @@ DEFINE_string(reasoning_parser, // --- qwen3 reranker config --- DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker."); + +DEFINE_bool(enable_prefetch_weight, + false, + "Whether to enable prefetch weight."); diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h index 5c79a7c3..7fc36442 100644 --- a/xllm/core/common/global_flags.h +++ b/xllm/core/common/global_flags.h @@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker); DECLARE_string(reasoning_parser); DECLARE_bool(enable_shm); + +DECLARE_bool(enable_prefetch_weight); diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp index 0067e38a..3a219cec 100644 --- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp @@ -89,11 +89,15 @@ enum DecoderLayerTensorId : int { IN_MLP_CPROJ_SCALE = 48, // scale IN_MLP_CPROJ_COMPRESS_IDX = 49, - Q_NORM_WEIGHT = 50, - K_NORM_WEIGHT = 51, + IN_QKV_SCALE_FILL = 50, + IN_QKV_OFFSET_FILL = 51, + IN_MLP_SCALE_FILL = 52, + IN_MLP_OFFSET_FILL = 53, + Q_NORM_WEIGHT = 54, + K_NORM_WEIGHT = 55, }; -const uint64_t WEIGHT_COUNT_PER_LAYER = 52; +const uint64_t WEIGHT_COUNT_PER_LAYER = 56; static std::vector> WEIGHT_MAPPING = { {IN_NORM_WEIGHT, "input_layernorm.weight"}, @@ -207,11 +211,16 @@ void 
NpuQwen3DecoderLayerImpl::param_from_args( param.useQKNorm = true; param.numHiddenLayers = args.n_layers(); - + param.enableIntraLayerAddNorm = true; + param.enableInterLayerAddNorm = false; + param.enablePreFetchWeight = FLAGS_enable_prefetch_weight; initialize_quantization_parameters(param); if (isPrefill) { - param.enableAclnnRmsNorm = quantize_type_.empty(); + param.enableAclnnRmsNorm = + param.enableIntraLayerAddNorm && quantize_type_.empty() + ? false + : quantize_type_.empty(); // for prefix cache without chunked prefill. if (FLAGS_enable_prefix_cache && !FLAGS_enable_chunked_prefill && FLAGS_block_size != 128) { @@ -383,6 +392,38 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { at_weight_tensors_[idx] = at_placeholder_; } + if (prefill_param_.enableIntraLayerAddNorm || + prefill_param_.enableInterLayerAddNorm) { + if (quantize_type_.compare("w8a8") == 0) { + // 量化逻辑 + torch::ScalarType weight_fill_dtype = torch::kBFloat16; + int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1); + int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1); + at_weight_tensors_[IN_QKV_SCALE_FILL] = at_weight_tensors_[IN_Q_SCALE] + .repeat(weight_attn_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_MLP_SCALE_FILL] = + at_weight_tensors_[IN_MLP_W2_SCALE] + .repeat(weight_mlp_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_QKV_OFFSET_FILL] = at_weight_tensors_[IN_Q_OFFSET] + .repeat(weight_attn_shape) + .to(weight_fill_dtype); + at_weight_tensors_[IN_MLP_OFFSET_FILL] = + at_weight_tensors_[IN_MLP_W2_OFFSET] + .repeat(weight_mlp_shape) + .to(weight_fill_dtype); + } else { + // 浮点逻辑 + for (auto idx : {IN_QKV_SCALE_FILL, + IN_QKV_OFFSET_FILL, + IN_MLP_SCALE_FILL, + IN_MLP_OFFSET_FILL}) { + at_weight_tensors_[idx] = at_placeholder_; + } + } + } + c10_npu::NPUCachingAllocator::emptyCache(); for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { atb_weight_tensors_[i] = From 3416a4511b750f43105075a27900fcef85fc5b1d Mon Sep 17 
00:00:00 2001 From: Panxuanyu Date: Thu, 30 Oct 2025 22:50:11 +0800 Subject: [PATCH 2/3] bugfix: modify comment in english --- xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp index 3a219cec..d48379fb 100644 --- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp @@ -395,7 +395,7 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { if (prefill_param_.enableIntraLayerAddNorm || prefill_param_.enableInterLayerAddNorm) { if (quantize_type_.compare("w8a8") == 0) { - // 量化逻辑 + // quantize torch::ScalarType weight_fill_dtype = torch::kBFloat16; int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1); int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1); @@ -414,7 +414,7 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() { .repeat(weight_mlp_shape) .to(weight_fill_dtype); } else { - // 浮点逻辑 + // bfloat16 or float16 for (auto idx : {IN_QKV_SCALE_FILL, IN_QKV_OFFSET_FILL, IN_MLP_SCALE_FILL, From b6cd9cc2de2015c0bee6d00dd1994ba199166640 Mon Sep 17 00:00:00 2001 From: Panxuanyu Date: Tue, 4 Nov 2025 14:56:14 +0800 Subject: [PATCH 3/3] bugfix: modify parameter description for enable_prefetch_weight --- xllm/core/common/global_flags.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp index 516e762f..4fb98204 100644 --- a/xllm/core/common/global_flags.cpp +++ b/xllm/core/common/global_flags.cpp @@ -390,6 +390,9 @@ DEFINE_string(reasoning_parser, // --- qwen3 reranker config --- DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker."); -DEFINE_bool(enable_prefetch_weight, - false, - "Whether to enable prefetch weight."); +DEFINE_bool( + enable_prefetch_weight, + false, + 
"Whether to enable prefetch weight, only applicable to Qwen3-dense models. " + "The default prefetching ratio for the gateup weight is 40%. " + "To adjust it, set the environment variable, e.g. export PREFETCH_COEFFICIENT=0.5.");