From 0344d58e1408864f0970bfa3111d4bf3b2bad8f5 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Tue, 19 Aug 2025 09:31:18 +0000 Subject: [PATCH 1/4] [CANN] Optimize RMS_NORM using cache Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 138 ++++++++++++++++++++++++------- ggml/src/ggml-cann/common.h | 3 + 2 files changed, 111 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 2a5cb8abfa137..507aa2912da06 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -867,6 +867,98 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, return acl_tensor; } +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. + * @param acl_dst The destination tensor to be filled with the scalar value. + */ +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst) { + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); + ggml_cann_release_resources(ctx, acl_scalar); +} + +/** + * @brief Get or expand cached float32 tensors filled with scalar values. + * + * This function manages a cache of float32 tensors (zero-filled and one-filled). + * If the cache does not exist, it will initialize the cache with a zero tensor + * and a one tensor. If the requested tensor size exceeds the current cache + * capacity, the cache will be expanded accordingly. The function then returns + * an aclTensor created from the cached memory (either zero-filled or one-filled), + * depending on the input `value`. + * + * @param ctx The CANN backend context that manages cache memory. + * @param ne The tensor shape array (number of elements in each dimension). + * @param nb The stride size for each dimension. + * @param dims The number of tensor dimensions. + * @param value The scalar value (only supports 0 or 1) used to determine whether + * to return the zero-cache tensor or the one-cache tensor. + * @return An aclTensor pointer corresponding to the cached tensor. 
+ */ +static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, + int64_t* ne, size_t* nb, + int64_t dims, int64_t value) { + // init cache + if(ctx.f32_zero_cache == nullptr) { + // zero-cache pool init + size_t size = ctx.f32_cache_element * sizeof(float); + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // one-cache pool init + int64_t pool_ne[1] = { ctx.f32_cache_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } + + // Cache expansion + int64_t n_element = 1; + for(int i = 0; i < dims; i++) { + n_element = n_element * ne[i]; + } + if (ctx.f32_cache_element < n_element) { + // free old mem + aclrtFree(ctx.f32_zero_cache); + aclrtFree(ctx.f32_one_cache); + // init zero cache + ctx.f32_cache_element = n_element; + size_t size = n_element * sizeof(float); + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // one-cache pool init + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } + + void* cache; + if (value == 0) { + cache = ctx.f32_zero_cache; + } else { + cache = ctx.f32_one_cache; + } + + return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); +} + void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -875,20 +967,23 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - aclTensor* acl_gamma = aclnn_values( - ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, - ggml_cann_type_mapping(src->type), ggml_element_size(src)); - - size_t zero_tensor_n_bytes = - src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); - aclTensor* acl_rstd = - aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src)); + // build gamma, one... + size_t acl_gamma_nb[GGML_MAX_DIMS]; + acl_gamma_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_gamma = get_f32_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + + // build rstd, zero... 
+ size_t acl_rstd_nb[GGML_MAX_DIMS]; + acl_rstd_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } @@ -1277,23 +1372,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_tensor, tmp_mul_tensor, acl_dst); } -/** - * @brief Fills a tensor with a scalar value. - * - * This function fills the destination tensor `acl_dst` with the scalar value - * `scalar`. - * - * @param ctx The context for the CANN backend operations. - * @param scalar The scalar value used to fill the tensor. - * @param acl_dst The destination tensor to be filled with the scalar value. - */ -static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { - auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); - ggml_cann_release_resources(ctx, acl_scalar); -} - /** * @brief Raises each element of a tensor to the power of the corresponding * element in another tensor. diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 2c2033bfba857..f61ea9e5db20e 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -379,6 +379,9 @@ struct ggml_backend_cann_context { cann_task_queue task_queue; bool async_mode; bool support_set_rows; + void* f32_zero_cache = nullptr; + void* f32_one_cache = nullptr; + int64_t f32_cache_element = 1024 * 1024; aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ From 9b0ec0ead7e966a8f9b3ab315c2e7c505eedc48b Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Tue, 19 Aug 2025 09:42:02 +0000 Subject: [PATCH 2/4] fix typo Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 507aa2912da06..80e9950c6611c 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -886,12 +886,12 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, /** * @brief Get or expand cached float32 tensors filled with scalar values. - * - * This function manages a cache of float32 tensors (zero-filled and one-filled). - * If the cache does not exist, it will initialize the cache with a zero tensor - * and a one tensor. If the requested tensor size exceeds the current cache - * capacity, the cache will be expanded accordingly. The function then returns - * an aclTensor created from the cached memory (either zero-filled or one-filled), + * + * This function manages a cache of float32 tensors (zero-filled and one-filled). + * If the cache does not exist, it will initialize the cache with a zero tensor + * and a one tensor. If the requested tensor size exceeds the current cache + * capacity, the cache will be expanded accordingly. The function then returns + * an aclTensor created from the cached memory (either zero-filled or one-filled), * depending on the input `value`. * * @param ctx The CANN backend context that manages cache memory. 
@@ -902,7 +902,7 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, * to return the zero-cache tensor or the one-cache tensor. * @return An aclTensor pointer corresponding to the cached tensor. */ -static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, +static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, int64_t* ne, size_t* nb, int64_t dims, int64_t value) { // init cache @@ -911,10 +911,10 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, size_t size = ctx.f32_cache_element * sizeof(float); ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - + // one-cache pool init - int64_t pool_ne[1] = { ctx.f32_cache_element }; - size_t pool_nb[1] = { sizeof(float) }; + int64_t pool_ne[1] = { ctx.f32_cache_element }; + size_t pool_nb[1] = { sizeof(float) }; ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); aclTensor* acl_one = ggml_cann_create_tensor( ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, @@ -937,10 +937,10 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, size_t size = n_element * sizeof(float); ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - + // one-cache pool init - int64_t pool_ne[1] = { n_element }; - size_t pool_nb[1] = { sizeof(float) }; + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); aclTensor* acl_one = ggml_cann_create_tensor( ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, @@ -948,14 +948,14 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, aclnn_fill_scalar(ctx, 1, acl_one); ggml_cann_release_resources(ctx, acl_one); } - + void* cache; if (value == 0) { cache = ctx.f32_zero_cache; } else { cache = ctx.f32_one_cache; } - + return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); } @@ -983,7 +983,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); - + GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } From c24b995a1ba1bed308cb5df761ca12036a4a3c33 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Wed, 20 Aug 2025 03:07:35 +0000 Subject: [PATCH 3/4] fix review comment Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 90 +++++++++++++++----------------- ggml/src/ggml-cann/common.h | 3 +- 2 files changed, 43 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 80e9950c6611c..ef80236f64818 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -902,57 +902,50 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, * to return the zero-cache tensor or the one-cache tensor. * @return An aclTensor pointer corresponding to the cached tensor. 
*/ -static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, +static aclTensor* get_f32_zero_or_one_cache_acl_tensor(ggml_backend_cann_context& ctx, int64_t* ne, size_t* nb, int64_t dims, int64_t value) { - // init cache - if(ctx.f32_zero_cache == nullptr) { - // zero-cache pool init - size_t size = ctx.f32_cache_element * sizeof(float); - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - - // one-cache pool init - int64_t pool_ne[1] = { ctx.f32_cache_element }; - size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); - } + // just support one and zero cache + GGML_ASSERT(value == 1 || value == 0); - // Cache expansion + // Cache init or expansion int64_t n_element = 1; for(int i = 0; i < dims; i++) { n_element = n_element * ne[i]; } - if (ctx.f32_cache_element < n_element) { - // free old mem - aclrtFree(ctx.f32_zero_cache); - aclrtFree(ctx.f32_one_cache); - // init zero cache - ctx.f32_cache_element = n_element; - size_t size = n_element * sizeof(float); - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - - // one-cache pool init - int64_t pool_ne[1] = { n_element }; - size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); - } - + size_t size = n_element * sizeof(float); void* cache; if (value == 0) { + if(ctx.f32_zero_cache_element < n_element){ + //free old mem + if(ctx.f32_zero_cache != nullptr){ + aclrtFree(ctx.f32_zero_cache); + } + + //init zero cache + ctx.f32_zero_cache_element = n_element; + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + } cache = ctx.f32_zero_cache; } else { + if(ctx.f32_one_cache_element < n_element){ + //free old mem + if(ctx.f32_one_cache != nullptr){ + aclrtFree(ctx.f32_one_cache); + } + + //init one cache + ctx.f32_one_cache_element = n_element; + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } cache = ctx.f32_one_cache; } @@ -974,7 +967,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_gamma = get_f32_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + aclTensor* acl_gamma = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); // build rstd, zero... 
size_t acl_rstd_nb[GGML_MAX_DIMS]; @@ -982,7 +975,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + aclTensor* acl_rstd = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); @@ -998,14 +991,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; - size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * - src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src)); + void* buffer = one_tensor_allocator.get(); - aclTensor* mask_tensor = - aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src), value); + aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); + + aclnn_fill_scalar(ctx, value, mask_tensor); aclScalar* alpha = nullptr; float alphaValue = 1.0f; diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index f61ea9e5db20e..5858bd3f6a197 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -381,7 +381,8 @@ struct ggml_backend_cann_context { bool support_set_rows; void* f32_zero_cache = nullptr; void* f32_one_cache = nullptr; - int64_t f32_cache_element = 1024 * 1024; + int64_t f32_zero_cache_element = 0; + int64_t f32_one_cache_element = 0; aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ From 3c87db443ba6292af7c1f6145e2b9ab0219f56c2 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Wed, 20 Aug 2025 07:02:38 +0000 Subject: [PATCH 4/4] codestyle adjustment Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 127 +++++++++++++++++-------------- 1 file changed, 69 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index ef80236f64818..8f65904b8fe51 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -885,71 +885,66 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, } /** - * @brief Get or expand cached float32 tensors filled with scalar values. - * - * This function manages a cache of float32 tensors (zero-filled and one-filled). - * If the cache does not exist, it will initialize the cache with a zero tensor - * and a one tensor. If the requested tensor size exceeds the current cache - * capacity, the cache will be expanded accordingly. The function then returns - * an aclTensor created from the cached memory (either zero-filled or one-filled), - * depending on the input `value`. - * - * @param ctx The CANN backend context that manages cache memory. - * @param ne The tensor shape array (number of elements in each dimension). - * @param nb The stride size for each dimension. - * @param dims The number of tensor dimensions. 
- * @param value The scalar value (only supports 0 or 1) used to determine whether - * to return the zero-cache tensor or the one-cache tensor. - * @return An aclTensor pointer corresponding to the cached tensor. + * @brief Get or expand a cached float32 tensor filled with a scalar value. + * + * This function manages cached device memory for float32 tensors. If the current + * cache size is insufficient for the requested tensor shape, the old memory will + * be released and new memory will be allocated. The allocated buffer is then + * initialized either with zeros (when @p value == 0.0f) or with the given scalar + * value using CANN operations. Finally, an aclTensor object is created from the + * cached memory and returned. + * + * @param ctx The CANN backend context that manages device memory. + * @param buffer A pointer to the cached device buffer (will be allocated + * or reallocated if necessary). + * @param cache_element The current number of cached elements. This will be + * updated when the cache is expanded. + * @param ne The tensor shape array (number of elements in each dimension). + * @param nb The stride size for each dimension. + * @param dims The number of tensor dimensions. + * @param value The scalar value used to fill the tensor (supports zero + * initialization via memset or arbitrary values via fill_scalar). + * @return An aclTensor pointer created from the cached buffer. */ -static aclTensor* get_f32_zero_or_one_cache_acl_tensor(ggml_backend_cann_context& ctx, - int64_t* ne, size_t* nb, - int64_t dims, int64_t value) { - // just support one and zero cache - GGML_ASSERT(value == 1 || value == 0); - - // Cache init or expansion +static aclTensor* get_f32_cache_acl_tensor( + ggml_backend_cann_context& ctx, + void** buffer, + int64_t &cache_element, + int64_t* ne, + size_t* nb, + int64_t dims, + float value) { + // Calculate total number of elements int64_t n_element = 1; - for(int i = 0; i < dims; i++) { - n_element = n_element * ne[i]; + for (int i = 0; i < dims; i++) { + n_element *= ne[i]; } size_t size = n_element * sizeof(float); - void* cache; - if (value == 0) { - if(ctx.f32_zero_cache_element < n_element){ - //free old mem - if(ctx.f32_zero_cache != nullptr){ - aclrtFree(ctx.f32_zero_cache); - } - - //init zero cache - ctx.f32_zero_cache_element = n_element; - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // Allocate or expand cache if needed + if (cache_element < n_element) { + if (*buffer != nullptr) { + aclrtFree(*buffer); + *buffer = nullptr; } - cache = ctx.f32_zero_cache; - } else { - if(ctx.f32_one_cache_element < n_element){ - //free old mem - if(ctx.f32_one_cache != nullptr){ - aclrtFree(ctx.f32_one_cache); - } - - //init one cache - ctx.f32_one_cache_element = n_element; + + ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + cache_element = n_element; + + // Initialize cache + if (value == 0.0f) { + ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream())); + } else { int64_t pool_ne[1] = { n_element }; size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); + aclTensor* acl_value = ggml_cann_create_tensor( + *buffer, ACL_FLOAT, sizeof(float), pool_ne, 
pool_nb, 1); + aclnn_fill_scalar(ctx, 1, acl_value); + ggml_cann_release_resources(ctx, acl_value); } - cache = ctx.f32_one_cache; } - return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); + return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims); } void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -967,7 +962,15 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_gamma = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + aclTensor* acl_gamma = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_one_cache, + ctx.f32_one_cache_element, + src->ne, + acl_gamma_nb, + 1, // dims + 1.0f // value + ); // build rstd, zero... size_t acl_rstd_nb[GGML_MAX_DIMS]; @@ -975,7 +978,15 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_rstd = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + aclTensor* acl_rstd = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_zero_cache, + ctx.f32_zero_cache_element, + src->ne, + acl_rstd_nb, + GGML_MAX_DIMS, + 0.0f // value + ); GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); @@ -996,7 +1007,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); - + aclnn_fill_scalar(ctx, value, mask_tensor); aclScalar* alpha = nullptr;
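
Note on the technique: the series replaces the per-call scratch allocations in ggml_cann_rms_norm with grow-only caches held on the backend context — f32_one_cache (used as gamma) and f32_zero_cache (used as rstd) — each tracked by an element count and reallocated only when a larger tensor is requested. The sketch below is a minimal, host-only illustration of that pattern under assumed names (f32_scalar_cache, get_f32_cache do not exist in ggml-cann); the real implementation allocates device memory with aclrtMalloc and initializes it with aclrtMemsetAsync (zero case) or aclnn_fill_scalar (non-zero case).

    // Illustrative sketch only: grow-only cache of float32 values equal to `value`.
    // Names and host-side allocation are assumptions, not the ggml-cann API.
    #include <cstdint>
    #include <cstdlib>

    struct f32_scalar_cache {
        float*  data     = nullptr;  // cached buffer (device memory in the real backend)
        int64_t n_cached = 0;        // number of elements currently filled
    };

    // Return a buffer holding at least n_element floats, all equal to `value`.
    static float* get_f32_cache(f32_scalar_cache & cache, int64_t n_element, float value) {
        if (cache.n_cached < n_element) {
            std::free(cache.data);  // drop the smaller buffer
            cache.data = static_cast<float*>(std::malloc(n_element * sizeof(float)));
            for (int64_t i = 0; i < n_element; i++) {
                cache.data[i] = value;  // fill once; later calls reuse the buffer
            }
            cache.n_cached = n_element;
        }
        // Callers needing fewer elements just view a prefix of the buffer; the CANN
        // version does the same by passing the caller's ne/nb to ggml_cann_create_tensor.
        return cache.data;
    }

Because the cache never shrinks, repeated RMS_NORM calls with the same or smaller shapes reuse the existing buffers and skip both the allocation and the fill that the old aclnn_values/aclnn_zero path performed on every invocation.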