diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
index 30b9b4e3..4fb98204 100644
--- a/xllm/core/common/global_flags.cpp
+++ b/xllm/core/common/global_flags.cpp
@@ -389,3 +389,10 @@ DEFINE_string(reasoning_parser,
 // --- qwen3 reranker config ---
 
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(
+    enable_prefetch_weight,
+    false,
+    "Whether to enable weight prefetching; only applicable to the Qwen3 "
+    "dense model. The default prefetch ratio for the gate-up weight is 40%; "
+    "to adjust it, set e.g. export PREFETCH_COEFFOCIENT=0.5.");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
index 5c79a7c3..7fc36442 100644
--- a/xllm/core/common/global_flags.h
+++ b/xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_prefetch_weight);
diff --git a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
index 0067e38a..d48379fb 100644
--- a/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
+++ b/xllm/core/layers/npu/npu_qwen3_decoder_layer_impl.cpp
@@ -89,11 +89,15 @@ enum DecoderLayerTensorId : int {
   IN_MLP_CPROJ_SCALE = 48,  // scale
   IN_MLP_CPROJ_COMPRESS_IDX = 49,
 
-  Q_NORM_WEIGHT = 50,
-  K_NORM_WEIGHT = 51,
+  IN_QKV_SCALE_FILL = 50,
+  IN_QKV_OFFSET_FILL = 51,
+  IN_MLP_SCALE_FILL = 52,
+  IN_MLP_OFFSET_FILL = 53,
+  Q_NORM_WEIGHT = 54,
+  K_NORM_WEIGHT = 55,
 };
 
-const uint64_t WEIGHT_COUNT_PER_LAYER = 52;
+const uint64_t WEIGHT_COUNT_PER_LAYER = 56;
 
 static std::vector<std::pair<int, std::string>> WEIGHT_MAPPING = {
     {IN_NORM_WEIGHT, "input_layernorm.weight"},
@@ -207,11 +211,16 @@ void NpuQwen3DecoderLayerImpl::param_from_args(
   param.useQKNorm = true;
   param.numHiddenLayers = args.n_layers();
-
+  param.enableIntraLayerAddNorm = true;
+  param.enableInterLayerAddNorm = false;
+  param.enablePreFetchWeight = FLAGS_enable_prefetch_weight;
   initialize_quantization_parameters(param);
 
   if (isPrefill) {
-    param.enableAclnnRmsNorm = quantize_type_.empty();
+    param.enableAclnnRmsNorm =
+        param.enableIntraLayerAddNorm && quantize_type_.empty()
+            ? false
+            : quantize_type_.empty();
     // for prefix cache without chunked prefill.
     if (FLAGS_enable_prefix_cache && !FLAGS_enable_chunked_prefill &&
         FLAGS_block_size != 128) {
@@ -383,6 +392,38 @@ void NpuQwen3DecoderLayerImpl::merge_loaded_weights() {
     at_weight_tensors_[idx] = at_placeholder_;
   }
 
+  if (prefill_param_.enableIntraLayerAddNorm ||
+      prefill_param_.enableInterLayerAddNorm) {
+    if (quantize_type_.compare("w8a8") == 0) {
+      // quantize
+      torch::ScalarType weight_fill_dtype = torch::kBFloat16;
+      int64_t weight_attn_shape = at_weight_tensors_[IN_Q_WEIGHT].size(-1);
+      int64_t weight_mlp_shape = at_weight_tensors_[IN_MLP_W2_WEIGHT].size(-1);
+      at_weight_tensors_[IN_QKV_SCALE_FILL] = at_weight_tensors_[IN_Q_SCALE]
+                                                  .repeat(weight_attn_shape)
+                                                  .to(weight_fill_dtype);
+      at_weight_tensors_[IN_MLP_SCALE_FILL] =
+          at_weight_tensors_[IN_MLP_W2_SCALE]
+              .repeat(weight_mlp_shape)
+              .to(weight_fill_dtype);
+      at_weight_tensors_[IN_QKV_OFFSET_FILL] = at_weight_tensors_[IN_Q_OFFSET]
+                                                   .repeat(weight_attn_shape)
+                                                   .to(weight_fill_dtype);
+      at_weight_tensors_[IN_MLP_OFFSET_FILL] =
+          at_weight_tensors_[IN_MLP_W2_OFFSET]
+              .repeat(weight_mlp_shape)
+              .to(weight_fill_dtype);
+    } else {
+      // bfloat16 or float16
+      for (auto idx : {IN_QKV_SCALE_FILL,
+                       IN_QKV_OFFSET_FILL,
+                       IN_MLP_SCALE_FILL,
+                       IN_MLP_OFFSET_FILL}) {
+        at_weight_tensors_[idx] = at_placeholder_;
+      }
+    }
+  }
+
   c10_npu::NPUCachingAllocator::emptyCache();
   for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) {
     atb_weight_tensors_[i] =
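Note on the new *_FILL slots in merge_loaded_weights(): for w8a8, the per-tensor quantization scale/offset (e.g. IN_Q_SCALE) is repeated across the matching weight's last dimension and cast to bf16, presumably so the fused AddNorm path sees full-width tensors; in the bf16/fp16 branch there are no quant parameters, so the four slots are pointed at at_placeholder_ to keep the 56-tensor layout. A minimal standalone libtorch sketch of that expansion (q_scale, q_offset, weight_attn_shape and their values are made-up stand-ins, not the layer's real members):

#include <torch/torch.h>
#include <iostream>

int main() {
  // Stand-ins for the loaded per-tensor quant parameters
  // (at_weight_tensors_[IN_Q_SCALE] / [IN_Q_OFFSET] in the patch).
  torch::Tensor q_scale = torch::tensor({0.02f});
  torch::Tensor q_offset = torch::tensor({-3.0f});

  // Stand-in for at_weight_tensors_[IN_Q_WEIGHT].size(-1).
  int64_t weight_attn_shape = 4096;

  // Same expansion as IN_QKV_SCALE_FILL / IN_QKV_OFFSET_FILL above:
  // repeat along the last dim, then cast to bf16.
  torch::Tensor qkv_scale_fill =
      q_scale.repeat(weight_attn_shape).to(torch::kBFloat16);
  torch::Tensor qkv_offset_fill =
      q_offset.repeat(weight_attn_shape).to(torch::kBFloat16);

  std::cout << qkv_scale_fill.sizes() << std::endl;   // [4096]
  std::cout << qkv_offset_fill.sizes() << std::endl;  // [4096]
  return 0;
}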